From ab51283cbb3925c89436ea0ab8693a2ffb631524 Mon Sep 17 00:00:00 2001 From: Aleksandar Spasojevic Date: Tue, 10 Jun 2025 17:17:39 +0200 Subject: [PATCH 1/3] [AMDGPU] Optimize rotate instruction selection patterns This patch improves rotate instruction selection for AMDGPU by adding optimized patterns for the rotate right (rotr) operation. It now selects s_lshl + s_lshr + s_or (3 SALU instructions) instead of the previous v_alignbit + v_readfirstlane (2 VALU instructions). --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 17 ++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 25 ++ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 7 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + llvm/lib/Target/AMDGPU/SIInstructions.td | 6 + .../AMDGPU/GlobalISel/legalize-rotl-rotr.mir | 28 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 215 +++++++++---- llvm/test/CodeGen/AMDGPU/rotr.ll | 285 ++++++++++++++---- 12 files changed, 473 insertions(+), 127 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index bb4bf742fb861..4af559b9ae953 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -95,6 +95,10 @@ def gi_vinterpmods_hi : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_immsub : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods? def gi_vop3opsel : GIComplexOperandMatcher, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ac0cb549d020b..9d7f2c1a2cb4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -4001,6 +4001,23 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src, } IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); +bool AMDGPUDAGToDAGISel::SelectImmSub(SDValue In, SDValue &Src, + SDValue &InvSrc) const { + Src = In; + + // Handle constant operands + ConstantSDNode *ImmVal = dyn_cast(In); + if (ImmVal) + InvSrc = CurDAG->getTargetConstant(32 - ImmVal->getZExtValue(), SDLoc(In), + MVT::i32); + else { + // Fallback: generate SUB instruction for non-constant, non-negation cases + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::S_SUB_U32, SDLoc(In), MVT::i32, + {CurDAG->getTargetConstant(32, SDLoc(In), MVT::i32), In}); + InvSrc = SDValue(VMov, 0); + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index a86b75458923e..116d978839807 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -263,6 +263,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; + bool SelectImmSub(SDValue In, SDValue &Src, SDValue &InvSrc) const; + SDValue getHi16Elt(SDValue In) const; SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 650df2a87506a..89c02b75919eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -6660,6 +6660,31 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectImmSub(MachineOperand &Root) const { + + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + Register SrcInv = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + + // Handle constant operands + std::optional Val = getConstantZext32Val(Root.getReg(), *MRI); + + if (!Val) { + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U32), SrcInv) + .addImm(32) + .add(Root); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SrcInv) + .addImm(32 - *Val); + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(SrcInv); }, + }}; +} + std::pair AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, bool &Matched) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c760fe7ef99dd..c70f0f287ba3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -227,6 +227,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectScaleOffset(MachineOperand &Root, Register &Offset, bool IsSigned) const; + InstructionSelector::ComplexRendererFns + selectImmSub(MachineOperand &Root) const; + bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, int64_t *Offset, bool *ScaleOffset) const; InstructionSelector::ComplexRendererFns diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1a13b2226ecd6..c3aaaebb7b532 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2094,7 +2094,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(0, S32, S64) .lower(); - getActionDefinitionsBuilder({G_ROTR, G_ROTL}) + getActionDefinitionsBuilder(G_ROTR) + .legalFor({S32}) + .scalarize(0) + .lower(); + + getActionDefinitionsBuilder(G_ROTL) .scalarize(0) .lower(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7ed026ee5f69e..e16bfe3610a77 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4131,6 +4131,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_SMED3: case AMDGPU::G_AMDGPU_FMED3: return getDefaultMappingVOP(MI); + case AMDGPU::G_ROTR: + case AMDGPU::G_ROTL: { + if (isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); + } case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 0125580fc28bd..aa1b7c24129f6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1702,6 +1702,8 @@ def VOP3PMadMixBF16Mods : ComplexPattern; def VINTERPModsHi : ComplexPattern; +def ImmSub : ComplexPattern; + //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6cc9b3cc67530..e2d6f1c3f73f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2672,6 +2672,12 @@ def : AMDGPUPat < $src1), sub1) >; +// rotr pattern +def : AMDGPUPat < + (UniformBinFrag i32:$src0, (i32 (ImmSub i32:$src1, i32:$src1_inv))), + (S_OR_B32 (S_LSHR_B32 i32:$src0, i32:$src1), (S_LSHL_B32 i32:$src0, i32:$src1_inv)) +>; + let True16Predicate = NotHasTrue16BitInsts in { let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir index 7fdee12315754..9610caa1f2012 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir @@ -181,8 +181,8 @@ body: | ; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]] - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[SUB]](s32) - ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32) + ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[SUB]](s32) + ; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_ROTL %0, %1(s32) @@ -301,14 +301,14 @@ body: | ; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]] - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[SUB]](s32) + ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[SUB]](s32) ; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]] - ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[SUB1]](s32) + ; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[SUB1]](s32) ; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]] - ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[SUB2]](s32) + ; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[SUB2]](s32) ; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]] - ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[SUB3]](s32) - ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32) + ; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[SUB3]](s32) + ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32) ; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7 @@ -391,8 +391,8 @@ body: | ; GFX-NEXT: {{ $}} ; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 ; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[COPY1]](s32) - ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32) + ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[COPY1]](s32) + ; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_ROTR %0, %1(s32) @@ -452,11 +452,11 @@ body: | ; GFX-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[UV4]](s32) - ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[UV5]](s32) - ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[UV6]](s32) - ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[UV7]](s32) - ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32) + ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[UV4]](s32) + ; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[UV5]](s32) + ; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[UV6]](s32) + ; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[UV7]](s32) + ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32) ; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 0a1d15bf945f9..423a2dcf88090 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -26,11 +26,14 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s3 +; SI-NEXT: s_sub_u32 s4, 32, s3 +; SI-NEXT: s_lshr_b32 s3, s2, s3 +; SI-NEXT: s_lshl_b32 s2, s2, s4 +; SI-NEXT: s_or_b32 s2, s3, s2 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -39,10 +42,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 +; GFX8-NEXT: s_sub_u32 s4, 32, s3 +; GFX8-NEXT: s_lshr_b32 s3, s2, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -52,18 +58,26 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: s_sub_u32 s4, 32, s3 +; GFX10-NEXT: s_lshr_b32 s3, s2, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_u32 s4, 32, s3 +; GFX11-NEXT: s_lshr_b32 s3, s2, s3 +; GFX11-NEXT: s_lshl_b32 s2, s2, s4 +; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -97,14 +111,20 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 ; SI-NEXT: s_sub_i32 s2, 32, s2 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_sub_i32 s3, 32, s3 +; SI-NEXT: s_sub_u32 s6, 32, s2 +; SI-NEXT: s_sub_u32 s8, 32, s3 +; SI-NEXT: s_lshr_b32 s3, s1, s3 +; SI-NEXT: s_lshr_b32 s2, s0, s2 +; SI-NEXT: s_lshl_b32 s1, s1, s8 +; SI-NEXT: s_lshl_b32 s0, s0, s6 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: s_or_b32 s0, s2, s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -115,11 +135,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, 32, s2 ; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: s_sub_u32 s6, 32, s2 +; GFX8-NEXT: s_sub_u32 s7, 32, s3 +; GFX8-NEXT: s_lshr_b32 s3, s1, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s7 +; GFX8-NEXT: s_lshr_b32 s2, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -131,10 +157,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 ; GFX10-NEXT: s_sub_i32 s2, 32, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: s_sub_i32 s3, 32, s3 +; GFX10-NEXT: s_sub_u32 s4, 32, s2 +; GFX10-NEXT: s_sub_u32 s5, 32, s3 +; GFX10-NEXT: s_lshr_b32 s3, s1, s3 +; GFX10-NEXT: s_lshr_b32 s2, s0, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -143,12 +177,20 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s3, 32, s3 ; GFX11-NEXT: s_sub_i32 s2, 32, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_sub_u32 s6, 32, s2 +; GFX11-NEXT: s_sub_u32 s7, 32, s3 +; GFX11-NEXT: s_lshr_b32 s3, s1, s3 +; GFX11-NEXT: s_lshr_b32 s2, s0, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s1, s1, s7 +; GFX11-NEXT: s_or_b32 s0, s2, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -188,20 +230,32 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s4, 32, s12 -; SI-NEXT: s_sub_i32 s5, 32, s13 +; SI-NEXT: s_sub_i32 s2, 32, s12 +; SI-NEXT: s_sub_i32 s4, 32, s13 +; SI-NEXT: s_sub_i32 s5, 32, s14 ; SI-NEXT: s_sub_i32 s6, 32, s15 -; SI-NEXT: s_sub_i32 s7, 32, s14 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; SI-NEXT: s_sub_u32 s7, 32, s2 +; SI-NEXT: s_sub_u32 s12, 32, s4 +; SI-NEXT: s_sub_u32 s13, 32, s5 +; SI-NEXT: s_sub_u32 s14, 32, s6 +; SI-NEXT: s_lshr_b32 s6, s11, s6 +; SI-NEXT: s_lshr_b32 s5, s10, s5 +; SI-NEXT: s_lshr_b32 s4, s9, s4 +; SI-NEXT: s_lshr_b32 s2, s8, s2 +; SI-NEXT: s_lshl_b32 s11, s11, s14 +; SI-NEXT: s_lshl_b32 s10, s10, s13 +; SI-NEXT: s_lshl_b32 s9, s9, s12 +; SI-NEXT: s_lshl_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s11 +; SI-NEXT: s_or_b32 s5, s5, s10 +; SI-NEXT: s_or_b32 s4, s4, s9 +; SI-NEXT: s_or_b32 s7, s2, s7 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -210,19 +264,31 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s5, 32, s15 -; GFX8-NEXT: s_sub_i32 s4, 32, s14 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s3, 32, s13 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_sub_i32 s2, 32, s12 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_sub_i32 s3, 32, s13 +; GFX8-NEXT: s_sub_i32 s4, 32, s14 +; GFX8-NEXT: s_sub_i32 s12, 32, s15 +; GFX8-NEXT: s_sub_u32 s5, 32, s2 +; GFX8-NEXT: s_sub_u32 s6, 32, s3 +; GFX8-NEXT: s_sub_u32 s7, 32, s4 +; GFX8-NEXT: s_sub_u32 s13, 32, s12 +; GFX8-NEXT: s_lshr_b32 s12, s11, s12 +; GFX8-NEXT: s_lshl_b32 s11, s11, s13 +; GFX8-NEXT: s_lshr_b32 s4, s10, s4 +; GFX8-NEXT: s_lshl_b32 s7, s10, s7 +; GFX8-NEXT: s_lshr_b32 s3, s9, s3 +; GFX8-NEXT: s_lshl_b32 s6, s9, s6 +; GFX8-NEXT: s_lshr_b32 s2, s8, s2 +; GFX8-NEXT: s_lshl_b32 s5, s8, s5 +; GFX8-NEXT: s_or_b32 s11, s12, s11 +; GFX8-NEXT: s_or_b32 s4, s4, s7 +; GFX8-NEXT: s_or_b32 s3, s3, s6 +; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -236,12 +302,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s2, 32, s12 ; GFX10-NEXT: s_sub_i32 s3, 32, s13 -; GFX10-NEXT: s_sub_i32 s4, 32, s15 -; GFX10-NEXT: s_sub_i32 s5, 32, s14 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s4 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s5 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s2 +; GFX10-NEXT: s_sub_i32 s4, 32, s14 +; GFX10-NEXT: s_sub_i32 s5, 32, s15 +; GFX10-NEXT: s_sub_u32 s6, 32, s2 +; GFX10-NEXT: s_sub_u32 s7, 32, s3 +; GFX10-NEXT: s_sub_u32 s12, 32, s4 +; GFX10-NEXT: s_sub_u32 s13, 32, s5 +; GFX10-NEXT: s_lshr_b32 s5, s11, s5 +; GFX10-NEXT: s_lshr_b32 s4, s10, s4 +; GFX10-NEXT: s_lshr_b32 s3, s9, s3 +; GFX10-NEXT: s_lshr_b32 s2, s8, s2 +; GFX10-NEXT: s_lshl_b32 s11, s11, s13 +; GFX10-NEXT: s_lshl_b32 s10, s10, s12 +; GFX10-NEXT: s_lshl_b32 s7, s9, s7 +; GFX10-NEXT: s_lshl_b32 s6, s8, s6 +; GFX10-NEXT: s_or_b32 s5, s5, s11 +; GFX10-NEXT: s_or_b32 s4, s4, s10 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -250,16 +332,31 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s12 ; GFX11-NEXT: s_sub_i32 s3, 32, s13 -; GFX11-NEXT: s_sub_i32 s4, 32, s15 -; GFX11-NEXT: s_sub_i32 s5, 32, s14 -; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s4 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s5 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s2 +; GFX11-NEXT: s_sub_i32 s4, 32, s14 +; GFX11-NEXT: s_sub_i32 s5, 32, s15 +; GFX11-NEXT: s_sub_u32 s6, 32, s2 +; GFX11-NEXT: s_sub_u32 s7, 32, s3 +; GFX11-NEXT: s_sub_u32 s12, 32, s4 +; GFX11-NEXT: s_sub_u32 s13, 32, s5 +; GFX11-NEXT: s_lshr_b32 s5, s11, s5 +; GFX11-NEXT: s_lshr_b32 s4, s10, s4 +; GFX11-NEXT: s_lshr_b32 s3, s9, s3 +; GFX11-NEXT: s_lshr_b32 s2, s8, s2 +; GFX11-NEXT: s_lshl_b32 s11, s11, s13 +; GFX11-NEXT: s_lshl_b32 s10, s10, s12 +; GFX11-NEXT: s_lshl_b32 s7, s9, s7 +; GFX11-NEXT: s_lshl_b32 s6, s8, s6 +; GFX11-NEXT: s_or_b32 s5, s5, s11 +; GFX11-NEXT: s_or_b32 s4, s4, s10 +; GFX11-NEXT: s_or_b32 s2, s2, s6 +; GFX11-NEXT: s_or_b32 s3, s3, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 403a556688091..2ec5e6c893a98 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -1,10 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; R600-LABEL: rotr_i32: @@ -22,12 +23,15 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_sub_u32 s4, 32, s3 +; SI-NEXT: s_lshr_b32 s3, s2, s3 +; SI-NEXT: s_lshl_b32 s2, s2, s4 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -35,10 +39,13 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 +; GFX8-NEXT: s_sub_u32 s4, 32, s3 +; GFX8-NEXT: s_lshr_b32 s3, s2, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -47,18 +54,39 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: s_sub_u32 s4, 32, s3 +; GFX10-NEXT: s_lshr_b32 s3, s2, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX11-NEXT: s_sub_u32 s4, 32, s3 +; GFX11-NEXT: s_lshr_b32 s3, s2, s3 +; GFX11-NEXT: s_lshl_b32 s2, s2, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rotr_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_u32 s4, 32, s3 +; GFX12-NEXT: s_lshr_b32 s3, s2, s3 +; GFX12-NEXT: s_lshl_b32 s2, s2, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s2, s3, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = sub i32 32, %y %tmp1 = shl i32 %x, %tmp0 @@ -86,12 +114,18 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_sub_u32 s6, 32, s2 +; SI-NEXT: s_sub_u32 s8, 32, s3 +; SI-NEXT: s_lshr_b32 s3, s1, s3 +; SI-NEXT: s_lshr_b32 s2, s0, s2 +; SI-NEXT: s_lshl_b32 s1, s1, s8 +; SI-NEXT: s_lshl_b32 s0, s0, s6 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: s_or_b32 s0, s2, s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -100,11 +134,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: s_sub_u32 s6, 32, s2 +; GFX8-NEXT: s_sub_u32 s7, 32, s3 +; GFX8-NEXT: s_lshr_b32 s3, s1, s3 +; GFX8-NEXT: s_lshr_b32 s2, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s7 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -116,8 +156,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: s_sub_u32 s4, 32, s2 +; GFX10-NEXT: s_sub_u32 s5, 32, s3 +; GFX10-NEXT: s_lshr_b32 s3, s1, s3 +; GFX10-NEXT: s_lshr_b32 s2, s0, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -126,12 +174,40 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: s_sub_u32 s6, 32, s2 +; GFX11-NEXT: s_sub_u32 s7, 32, s3 +; GFX11-NEXT: s_lshr_b32 s3, s1, s3 +; GFX11-NEXT: s_lshr_b32 s2, s0, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s1, s1, s7 +; GFX11-NEXT: s_or_b32 s0, s2, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rotr_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_u32 s6, 32, s2 +; GFX12-NEXT: s_sub_co_u32 s7, 32, s3 +; GFX12-NEXT: s_lshr_b32 s3, s1, s3 +; GFX12-NEXT: s_lshr_b32 s2, s0, s2 +; GFX12-NEXT: s_lshl_b32 s0, s0, s6 +; GFX12-NEXT: s_lshl_b32 s1, s1, s7 +; GFX12-NEXT: s_or_b32 s0, s2, s0 +; GFX12-NEXT: s_or_b32 s1, s3, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm entry: %tmp0 = sub <2 x i32> , %y %tmp1 = shl <2 x i32> %x, %tmp0 @@ -161,16 +237,28 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: s_sub_u32 s2, 32, s12 +; SI-NEXT: s_sub_u32 s4, 32, s13 +; SI-NEXT: s_sub_u32 s5, 32, s14 +; SI-NEXT: s_sub_u32 s6, 32, s15 +; SI-NEXT: s_lshr_b32 s7, s11, s15 +; SI-NEXT: s_lshr_b32 s14, s10, s14 +; SI-NEXT: s_lshr_b32 s13, s9, s13 +; SI-NEXT: s_lshr_b32 s12, s8, s12 +; SI-NEXT: s_lshl_b32 s6, s11, s6 +; SI-NEXT: s_lshl_b32 s5, s10, s5 +; SI-NEXT: s_lshl_b32 s4, s9, s4 +; SI-NEXT: s_lshl_b32 s2, s8, s2 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_or_b32 s4, s13, s4 +; SI-NEXT: s_or_b32 s7, s12, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -179,15 +267,27 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s15 -; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: s_sub_u32 s5, 32, s15 +; GFX8-NEXT: s_sub_u32 s4, 32, s14 +; GFX8-NEXT: s_lshr_b32 s6, s11, s15 +; GFX8-NEXT: s_lshl_b32 s5, s11, s5 +; GFX8-NEXT: s_sub_u32 s3, 32, s13 +; GFX8-NEXT: s_or_b32 s5, s6, s5 +; GFX8-NEXT: s_lshr_b32 s6, s10, s14 +; GFX8-NEXT: s_lshl_b32 s4, s10, s4 +; GFX8-NEXT: s_sub_u32 s2, 32, s12 +; GFX8-NEXT: s_or_b32 s4, s6, s4 +; GFX8-NEXT: s_lshr_b32 s6, s9, s13 +; GFX8-NEXT: s_lshl_b32 s3, s9, s3 +; GFX8-NEXT: s_or_b32 s3, s6, s3 +; GFX8-NEXT: s_lshr_b32 s6, s8, s12 +; GFX8-NEXT: s_lshl_b32 s2, s8, s2 +; GFX8-NEXT: s_or_b32 s2, s6, s2 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -199,10 +299,26 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s15 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s14 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s13 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s12 +; GFX10-NEXT: s_sub_u32 s2, 32, s12 +; GFX10-NEXT: s_sub_u32 s3, 32, s13 +; GFX10-NEXT: s_sub_u32 s4, 32, s14 +; GFX10-NEXT: s_sub_u32 s5, 32, s15 +; GFX10-NEXT: s_lshr_b32 s6, s11, s15 +; GFX10-NEXT: s_lshr_b32 s7, s10, s14 +; GFX10-NEXT: s_lshr_b32 s13, s9, s13 +; GFX10-NEXT: s_lshr_b32 s12, s8, s12 +; GFX10-NEXT: s_lshl_b32 s5, s11, s5 +; GFX10-NEXT: s_lshl_b32 s4, s10, s4 +; GFX10-NEXT: s_lshl_b32 s3, s9, s3 +; GFX10-NEXT: s_lshl_b32 s2, s8, s2 +; GFX10-NEXT: s_or_b32 s5, s6, s5 +; GFX10-NEXT: s_or_b32 s4, s7, s4 +; GFX10-NEXT: s_or_b32 s2, s12, s2 +; GFX10-NEXT: s_or_b32 s3, s13, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -211,14 +327,58 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s15 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s14 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s13 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s12 +; GFX11-NEXT: s_sub_u32 s2, 32, s12 +; GFX11-NEXT: s_sub_u32 s3, 32, s13 +; GFX11-NEXT: s_sub_u32 s4, 32, s14 +; GFX11-NEXT: s_sub_u32 s5, 32, s15 +; GFX11-NEXT: s_lshr_b32 s6, s11, s15 +; GFX11-NEXT: s_lshr_b32 s7, s10, s14 +; GFX11-NEXT: s_lshr_b32 s13, s9, s13 +; GFX11-NEXT: s_lshr_b32 s12, s8, s12 +; GFX11-NEXT: s_lshl_b32 s5, s11, s5 +; GFX11-NEXT: s_lshl_b32 s4, s10, s4 +; GFX11-NEXT: s_lshl_b32 s3, s9, s3 +; GFX11-NEXT: s_lshl_b32 s2, s8, s2 +; GFX11-NEXT: s_or_b32 s5, s6, s5 +; GFX11-NEXT: s_or_b32 s4, s7, s4 +; GFX11-NEXT: s_or_b32 s2, s12, s2 +; GFX11-NEXT: s_or_b32 s3, s13, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rotr_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_u32 s2, 32, s12 +; GFX12-NEXT: s_sub_co_u32 s3, 32, s13 +; GFX12-NEXT: s_sub_co_u32 s4, 32, s14 +; GFX12-NEXT: s_sub_co_u32 s5, 32, s15 +; GFX12-NEXT: s_lshr_b32 s6, s11, s15 +; GFX12-NEXT: s_lshr_b32 s7, s10, s14 +; GFX12-NEXT: s_lshr_b32 s13, s9, s13 +; GFX12-NEXT: s_lshr_b32 s12, s8, s12 +; GFX12-NEXT: s_lshl_b32 s5, s11, s5 +; GFX12-NEXT: s_lshl_b32 s4, s10, s4 +; GFX12-NEXT: s_lshl_b32 s3, s9, s3 +; GFX12-NEXT: s_lshl_b32 s2, s8, s2 +; GFX12-NEXT: s_or_b32 s5, s6, s5 +; GFX12-NEXT: s_or_b32 s4, s7, s4 +; GFX12-NEXT: s_or_b32 s2, s12, s2 +; GFX12-NEXT: s_or_b32 s3, s13, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = sub <4 x i32> , %y %tmp1 = shl <4 x i32> %x, %tmp0 @@ -484,6 +644,25 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-FAKE16-NEXT: global_store_b16 v[4:5], v0, off offset:8 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_rotr_i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v2, v[2:3], off offset:48 +; GFX12-NEXT: global_load_u16 v0, v[0:1], off offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_sub_nc_u16 v1, 0, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b16 v2, v2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b16 v0, v1, v0 +; GFX12-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX12-NEXT: global_store_b16 v[4:5], v0, off offset:8 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16 %a = load i16, ptr addrspace(1) %arrayidx From c28a11ed9f934e47c4069ad50dc2eb1991cb72dc Mon Sep 17 00:00:00 2001 From: Aleksandar Spasojevic Date: Wed, 11 Jun 2025 10:53:44 +0200 Subject: [PATCH 2/3] clang-format correction --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c3aaaebb7b532..b26121ccd980f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2094,14 +2094,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(0, S32, S64) .lower(); - getActionDefinitionsBuilder(G_ROTR) - .legalFor({S32}) - .scalarize(0) - .lower(); + getActionDefinitionsBuilder(G_ROTR).legalFor({S32}).scalarize(0).lower(); - getActionDefinitionsBuilder(G_ROTL) - .scalarize(0) - .lower(); + getActionDefinitionsBuilder(G_ROTL).scalarize(0).lower(); auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR); FSHRActionDefs.legalFor({{S32, S32}}) From 4899180e23757844893f365f9c58dccb6e438707 Mon Sep 17 00:00:00 2001 From: Aleksandar Spasojevic Date: Thu, 20 Nov 2025 14:22:58 +0100 Subject: [PATCH 3/3] First approach implementation --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 - llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 17 - llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 - .../AMDGPU/AMDGPUInstructionSelector.cpp | 25 - .../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 - llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 - llvm/lib/Target/AMDGPU/SIInstructions.td | 10 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 294 +++++---- llvm/test/CodeGen/AMDGPU/rotr.ll | 559 +++++++++++------- 9 files changed, 483 insertions(+), 432 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 4af559b9ae953..bb4bf742fb861 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -95,10 +95,6 @@ def gi_vinterpmods_hi : GIComplexOperandMatcher, GIComplexPatternEquiv; -def gi_immsub : - GIComplexOperandMatcher, - GIComplexPatternEquiv; - // FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods? def gi_vop3opsel : GIComplexOperandMatcher, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 9d7f2c1a2cb4b..ac0cb549d020b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -4001,23 +4001,6 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src, } IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); -bool AMDGPUDAGToDAGISel::SelectImmSub(SDValue In, SDValue &Src, - SDValue &InvSrc) const { - Src = In; - - // Handle constant operands - ConstantSDNode *ImmVal = dyn_cast(In); - if (ImmVal) - InvSrc = CurDAG->getTargetConstant(32 - ImmVal->getZExtValue(), SDLoc(In), - MVT::i32); - else { - // Fallback: generate SUB instruction for non-constant, non-negation cases - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::S_SUB_U32, SDLoc(In), MVT::i32, - {CurDAG->getTargetConstant(32, SDLoc(In), MVT::i32), In}); - InvSrc = SDValue(VMov, 0); - } - return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 116d978839807..a86b75458923e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -263,8 +263,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; - bool SelectImmSub(SDValue In, SDValue &Src, SDValue &InvSrc) const; - SDValue getHi16Elt(SDValue In) const; SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 89c02b75919eb..650df2a87506a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -6660,31 +6660,6 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectImmSub(MachineOperand &Root) const { - - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - Register SrcInv = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - - // Handle constant operands - std::optional Val = getConstantZext32Val(Root.getReg(), *MRI); - - if (!Val) { - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U32), SrcInv) - .addImm(32) - .add(Root); - } else { - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SrcInv) - .addImm(32 - *Val); - } - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(SrcInv); }, - }}; -} - std::pair AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, bool &Matched) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c70f0f287ba3e..c705936b43ab5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -227,8 +227,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectScaleOffset(MachineOperand &Root, Register &Offset, bool IsSigned) const; - InstructionSelector::ComplexRendererFns - selectImmSub(MachineOperand &Root) const; bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, int64_t *Offset, bool *ScaleOffset) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index aa1b7c24129f6..0125580fc28bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1702,8 +1702,6 @@ def VOP3PMadMixBF16Mods : ComplexPattern; def VINTERPModsHi : ComplexPattern; -def ImmSub : ComplexPattern; - //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e2d6f1c3f73f4..3769e5cb3b891 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2672,10 +2672,12 @@ def : AMDGPUPat < $src1), sub1) >; -// rotr pattern -def : AMDGPUPat < - (UniformBinFrag i32:$src0, (i32 (ImmSub i32:$src1, i32:$src1_inv))), - (S_OR_B32 (S_LSHR_B32 i32:$src0, i32:$src1), (S_LSHL_B32 i32:$src0, i32:$src1_inv)) +def : GCNPat<(UniformBinFrag i32:$src0, i32:$src1), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src0, sub0, $src0, sub1), (S_AND_B32 $src1, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformBinFrag i32:$src0, (i32 ShiftAmt32Imm:$src1)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src0, sub0, $src0, sub1), $src1), sub0)) >; let True16Predicate = NotHasTrue16BitInsts in { diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 423a2dcf88090..a7fcb6439703a 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -25,11 +25,10 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_sub_u32 s4, 32, s3 -; SI-NEXT: s_lshr_b32 s3, s2, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s4 -; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_sub_i32 s4, 32, s3 +; SI-NEXT: s_mov_b32 s3, s2 +; SI-NEXT: s_and_b32 s4, s4, 31 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -41,11 +40,10 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: s_sub_u32 s4, 32, s3 -; GFX8-NEXT: s_lshr_b32 s3, s2, s3 -; GFX8-NEXT: s_lshl_b32 s2, s2, s4 -; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s4, 32, s3 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s4, s4, 31 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -58,10 +56,9 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: s_sub_u32 s4, 32, s3 -; GFX10-NEXT: s_lshr_b32 s3, s2, s3 -; GFX10-NEXT: s_lshl_b32 s2, s2, s4 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s4, s3, 31 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -71,11 +68,10 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_u32 s4, 32, s3 -; GFX11-NEXT: s_lshr_b32 s3, s2, s3 -; GFX11-NEXT: s_lshl_b32 s2, s2, s4 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s4, s3, 31 +; GFX11-NEXT: s_mov_b32 s3, s2 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -112,19 +108,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s2, 32, s2 -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_sub_u32 s6, 32, s2 -; SI-NEXT: s_sub_u32 s8, 32, s3 -; SI-NEXT: s_lshr_b32 s3, s1, s3 -; SI-NEXT: s_lshr_b32 s2, s0, s2 -; SI-NEXT: s_lshl_b32 s1, s1, s8 -; SI-NEXT: s_lshl_b32 s0, s0, s6 -; SI-NEXT: s_or_b32 s1, s3, s1 -; SI-NEXT: s_or_b32 s0, s2, s0 +; SI-NEXT: s_sub_i32 s6, 32, s2 +; SI-NEXT: s_sub_i32 s8, 32, s3 +; SI-NEXT: s_mov_b32 s2, s1 +; SI-NEXT: s_mov_b32 s3, s1 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: s_and_b32 s8, s8, 31 +; SI-NEXT: s_and_b32 s6, s6, 31 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -133,19 +128,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 32, s2 -; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: s_sub_u32 s6, 32, s2 -; GFX8-NEXT: s_sub_u32 s7, 32, s3 -; GFX8-NEXT: s_lshr_b32 s3, s1, s3 -; GFX8-NEXT: s_lshl_b32 s1, s1, s7 -; GFX8-NEXT: s_lshr_b32 s2, s0, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_or_b32 s1, s3, s1 -; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_sub_i32 s7, 32, s3 +; GFX8-NEXT: s_sub_i32 s6, 32, s2 +; GFX8-NEXT: s_mov_b32 s2, s1 +; GFX8-NEXT: s_mov_b32 s3, s1 +; GFX8-NEXT: s_and_b32 s1, s7, 31 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s1 +; GFX8-NEXT: s_and_b32 s3, s6, 31 +; GFX8-NEXT: s_mov_b32 s1, s0 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -157,18 +151,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s2 -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: s_sub_u32 s4, 32, s2 -; GFX10-NEXT: s_sub_u32 s5, 32, s3 -; GFX10-NEXT: s_lshr_b32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s2, s0, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s1, s1, s5 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_sub_i32 s4, 32, s2 +; GFX10-NEXT: s_sub_i32 s5, 32, s3 +; GFX10-NEXT: s_mov_b32 s2, s1 +; GFX10-NEXT: s_mov_b32 s3, s1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_and_b32 s4, s4, 31 +; GFX10-NEXT: s_and_b32 s5, s5, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -178,18 +171,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s2, 32, s2 -; GFX11-NEXT: s_sub_i32 s3, 32, s3 -; GFX11-NEXT: s_sub_u32 s6, 32, s2 -; GFX11-NEXT: s_sub_u32 s7, 32, s3 -; GFX11-NEXT: s_lshr_b32 s3, s1, s3 -; GFX11-NEXT: s_lshr_b32 s2, s0, s2 -; GFX11-NEXT: s_lshl_b32 s0, s0, s6 -; GFX11-NEXT: s_lshl_b32 s1, s1, s7 -; GFX11-NEXT: s_or_b32 s0, s2, s0 -; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_sub_i32 s6, 32, s2 +; GFX11-NEXT: s_sub_i32 s7, 32, s3 +; GFX11-NEXT: s_mov_b32 s2, s1 +; GFX11-NEXT: s_mov_b32 s3, s1 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_and_b32 s6, s6, 31 +; GFX11-NEXT: s_and_b32 s7, s7, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm @@ -232,30 +224,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s2, 32, s12 -; SI-NEXT: s_sub_i32 s4, 32, s13 -; SI-NEXT: s_sub_i32 s5, 32, s14 -; SI-NEXT: s_sub_i32 s6, 32, s15 -; SI-NEXT: s_sub_u32 s7, 32, s2 -; SI-NEXT: s_sub_u32 s12, 32, s4 -; SI-NEXT: s_sub_u32 s13, 32, s5 -; SI-NEXT: s_sub_u32 s14, 32, s6 -; SI-NEXT: s_lshr_b32 s6, s11, s6 -; SI-NEXT: s_lshr_b32 s5, s10, s5 -; SI-NEXT: s_lshr_b32 s4, s9, s4 -; SI-NEXT: s_lshr_b32 s2, s8, s2 -; SI-NEXT: s_lshl_b32 s11, s11, s14 -; SI-NEXT: s_lshl_b32 s10, s10, s13 -; SI-NEXT: s_lshl_b32 s9, s9, s12 -; SI-NEXT: s_lshl_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s11 -; SI-NEXT: s_or_b32 s5, s5, s10 -; SI-NEXT: s_or_b32 s4, s4, s9 -; SI-NEXT: s_or_b32 s7, s2, s7 +; SI-NEXT: s_sub_i32 s12, 32, s13 +; SI-NEXT: s_sub_i32 s13, 32, s14 +; SI-NEXT: s_sub_i32 s14, 32, s15 +; SI-NEXT: s_mov_b32 s4, s11 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s11, s10 +; SI-NEXT: s_mov_b32 s6, s9 +; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_mov_b32 s9, s8 +; SI-NEXT: s_and_b32 s14, s14, 31 +; SI-NEXT: s_and_b32 s13, s13, 31 +; SI-NEXT: s_and_b32 s12, s12, 31 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s13 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s12 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s2 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -264,31 +254,29 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 32, s12 -; GFX8-NEXT: s_sub_i32 s3, 32, s13 +; GFX8-NEXT: s_sub_i32 s2, 32, s15 +; GFX8-NEXT: s_and_b32 s5, s2, 31 +; GFX8-NEXT: s_mov_b32 s2, s11 +; GFX8-NEXT: s_mov_b32 s3, s11 ; GFX8-NEXT: s_sub_i32 s4, 32, s14 -; GFX8-NEXT: s_sub_i32 s12, 32, s15 -; GFX8-NEXT: s_sub_u32 s5, 32, s2 -; GFX8-NEXT: s_sub_u32 s6, 32, s3 -; GFX8-NEXT: s_sub_u32 s7, 32, s4 -; GFX8-NEXT: s_sub_u32 s13, 32, s12 -; GFX8-NEXT: s_lshr_b32 s12, s11, s12 -; GFX8-NEXT: s_lshl_b32 s11, s11, s13 -; GFX8-NEXT: s_lshr_b32 s4, s10, s4 -; GFX8-NEXT: s_lshl_b32 s7, s10, s7 -; GFX8-NEXT: s_lshr_b32 s3, s9, s3 -; GFX8-NEXT: s_lshl_b32 s6, s9, s6 -; GFX8-NEXT: s_lshr_b32 s2, s8, s2 -; GFX8-NEXT: s_lshl_b32 s5, s8, s5 -; GFX8-NEXT: s_or_b32 s11, s12, s11 -; GFX8-NEXT: s_or_b32 s4, s4, s7 -; GFX8-NEXT: s_or_b32 s3, s3, s6 -; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX8-NEXT: s_sub_i32 s6, 32, s13 +; GFX8-NEXT: s_and_b32 s3, s4, 31 +; GFX8-NEXT: s_mov_b32 s11, s10 +; GFX8-NEXT: s_sub_i32 s12, 32, s12 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s3 +; GFX8-NEXT: s_and_b32 s3, s6, 31 +; GFX8-NEXT: s_mov_b32 s6, s9 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; GFX8-NEXT: s_and_b32 s3, s12, 31 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -300,30 +288,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s12 -; GFX10-NEXT: s_sub_i32 s3, 32, s13 -; GFX10-NEXT: s_sub_i32 s4, 32, s14 -; GFX10-NEXT: s_sub_i32 s5, 32, s15 -; GFX10-NEXT: s_sub_u32 s6, 32, s2 -; GFX10-NEXT: s_sub_u32 s7, 32, s3 -; GFX10-NEXT: s_sub_u32 s12, 32, s4 -; GFX10-NEXT: s_sub_u32 s13, 32, s5 -; GFX10-NEXT: s_lshr_b32 s5, s11, s5 -; GFX10-NEXT: s_lshr_b32 s4, s10, s4 -; GFX10-NEXT: s_lshr_b32 s3, s9, s3 -; GFX10-NEXT: s_lshr_b32 s2, s8, s2 -; GFX10-NEXT: s_lshl_b32 s11, s11, s13 -; GFX10-NEXT: s_lshl_b32 s10, s10, s12 -; GFX10-NEXT: s_lshl_b32 s7, s9, s7 -; GFX10-NEXT: s_lshl_b32 s6, s8, s6 -; GFX10-NEXT: s_or_b32 s5, s5, s11 -; GFX10-NEXT: s_or_b32 s4, s4, s10 -; GFX10-NEXT: s_or_b32 s2, s2, s6 -; GFX10-NEXT: s_or_b32 s3, s3, s7 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_sub_i32 s6, 32, s12 +; GFX10-NEXT: s_sub_i32 s7, 32, s13 +; GFX10-NEXT: s_sub_i32 s12, 32, s14 +; GFX10-NEXT: s_sub_i32 s13, 32, s15 +; GFX10-NEXT: s_mov_b32 s2, s11 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_mov_b32 s4, s9 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_and_b32 s13, s13, 31 +; GFX10-NEXT: s_and_b32 s12, s12, 31 +; GFX10-NEXT: s_and_b32 s14, s7, 31 +; GFX10-NEXT: s_and_b32 s15, s6, 31 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[10:11], s12 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s15 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -333,30 +319,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s2, 32, s12 -; GFX11-NEXT: s_sub_i32 s3, 32, s13 -; GFX11-NEXT: s_sub_i32 s4, 32, s14 -; GFX11-NEXT: s_sub_i32 s5, 32, s15 -; GFX11-NEXT: s_sub_u32 s6, 32, s2 -; GFX11-NEXT: s_sub_u32 s7, 32, s3 -; GFX11-NEXT: s_sub_u32 s12, 32, s4 -; GFX11-NEXT: s_sub_u32 s13, 32, s5 -; GFX11-NEXT: s_lshr_b32 s5, s11, s5 -; GFX11-NEXT: s_lshr_b32 s4, s10, s4 -; GFX11-NEXT: s_lshr_b32 s3, s9, s3 -; GFX11-NEXT: s_lshr_b32 s2, s8, s2 -; GFX11-NEXT: s_lshl_b32 s11, s11, s13 -; GFX11-NEXT: s_lshl_b32 s10, s10, s12 -; GFX11-NEXT: s_lshl_b32 s7, s9, s7 -; GFX11-NEXT: s_lshl_b32 s6, s8, s6 -; GFX11-NEXT: s_or_b32 s5, s5, s11 -; GFX11-NEXT: s_or_b32 s4, s4, s10 -; GFX11-NEXT: s_or_b32 s2, s2, s6 -; GFX11-NEXT: s_or_b32 s3, s3, s7 +; GFX11-NEXT: s_sub_i32 s6, 32, s12 +; GFX11-NEXT: s_sub_i32 s7, 32, s13 +; GFX11-NEXT: s_sub_i32 s12, 32, s14 +; GFX11-NEXT: s_sub_i32 s13, 32, s15 +; GFX11-NEXT: s_mov_b32 s2, s11 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s11, s10 +; GFX11-NEXT: s_mov_b32 s4, s9 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_and_b32 s13, s13, 31 +; GFX11-NEXT: s_and_b32 s12, s12, 31 +; GFX11-NEXT: s_and_b32 s14, s7, 31 +; GFX11-NEXT: s_and_b32 s15, s6, 31 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s12 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s15 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 2ec5e6c893a98..199433734529b 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -24,10 +24,9 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_u32 s4, 32, s3 -; SI-NEXT: s_lshr_b32 s3, s2, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s4 -; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_and_b32 s4, s3, 31 +; SI-NEXT: s_mov_b32 s3, s2 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -39,10 +38,9 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_u32 s4, 32, s3 -; GFX8-NEXT: s_lshr_b32 s3, s2, s3 -; GFX8-NEXT: s_lshl_b32 s2, s2, s4 -; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s4, s3, 31 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -54,10 +52,9 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_u32 s4, 32, s3 -; GFX10-NEXT: s_lshr_b32 s3, s2, s3 -; GFX10-NEXT: s_lshl_b32 s2, s2, s4 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s4, s3, 31 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -66,11 +63,10 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_u32 s4, 32, s3 -; GFX11-NEXT: s_lshr_b32 s3, s2, s3 -; GFX11-NEXT: s_lshl_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s4, s3, 31 +; GFX11-NEXT: s_mov_b32 s3, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -79,11 +75,10 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_co_u32 s4, 32, s3 -; GFX12-NEXT: s_lshr_b32 s3, s2, s3 -; GFX12-NEXT: s_lshl_b32 s2, s2, s4 +; GFX12-NEXT: s_and_b32 s4, s3, 31 +; GFX12-NEXT: s_mov_b32 s3, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_or_b32 s2, s3, s2 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -115,17 +110,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_u32 s6, 32, s2 -; SI-NEXT: s_sub_u32 s8, 32, s3 -; SI-NEXT: s_lshr_b32 s3, s1, s3 -; SI-NEXT: s_lshr_b32 s2, s0, s2 -; SI-NEXT: s_lshl_b32 s1, s1, s8 -; SI-NEXT: s_lshl_b32 s0, s0, s6 -; SI-NEXT: s_or_b32 s1, s3, s1 -; SI-NEXT: s_or_b32 s0, s2, s0 +; SI-NEXT: s_and_b32 s3, s3, 31 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_and_b32 s6, s2, 31 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], s3 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -134,17 +128,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_u32 s6, 32, s2 -; GFX8-NEXT: s_sub_u32 s7, 32, s3 -; GFX8-NEXT: s_lshr_b32 s3, s1, s3 -; GFX8-NEXT: s_lshr_b32 s2, s0, s2 -; GFX8-NEXT: s_lshl_b32 s1, s1, s7 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_or_b32 s1, s3, s1 -; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s3, s3, 31 +; GFX8-NEXT: s_mov_b32 s6, s1 +; GFX8-NEXT: s_mov_b32 s7, s1 +; GFX8-NEXT: s_and_b32 s8, s2, 31 +; GFX8-NEXT: s_mov_b32 s1, s0 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], s3 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -156,16 +149,15 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_u32 s4, 32, s2 -; GFX10-NEXT: s_sub_u32 s5, 32, s3 -; GFX10-NEXT: s_lshr_b32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s2, s0, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s1, s1, s5 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_and_b32 s3, s3, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -175,16 +167,15 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_u32 s6, 32, s2 -; GFX11-NEXT: s_sub_u32 s7, 32, s3 -; GFX11-NEXT: s_lshr_b32 s3, s1, s3 -; GFX11-NEXT: s_lshr_b32 s2, s0, s2 -; GFX11-NEXT: s_lshl_b32 s0, s0, s6 -; GFX11-NEXT: s_lshl_b32 s1, s1, s7 -; GFX11-NEXT: s_or_b32 s0, s2, s0 -; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_and_b32 s3, s3, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm @@ -195,16 +186,15 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_co_u32 s6, 32, s2 -; GFX12-NEXT: s_sub_co_u32 s7, 32, s3 -; GFX12-NEXT: s_lshr_b32 s3, s1, s3 -; GFX12-NEXT: s_lshr_b32 s2, s0, s2 -; GFX12-NEXT: s_lshl_b32 s0, s0, s6 -; GFX12-NEXT: s_lshl_b32 s1, s1, s7 -; GFX12-NEXT: s_or_b32 s0, s2, s0 -; GFX12-NEXT: s_or_b32 s1, s3, s1 +; GFX12-NEXT: s_mov_b32 s6, s1 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_and_b32 s2, s2, 31 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_and_b32 s3, s3, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm @@ -238,27 +228,25 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_u32 s2, 32, s12 -; SI-NEXT: s_sub_u32 s4, 32, s13 -; SI-NEXT: s_sub_u32 s5, 32, s14 -; SI-NEXT: s_sub_u32 s6, 32, s15 -; SI-NEXT: s_lshr_b32 s7, s11, s15 -; SI-NEXT: s_lshr_b32 s14, s10, s14 -; SI-NEXT: s_lshr_b32 s13, s9, s13 -; SI-NEXT: s_lshr_b32 s12, s8, s12 -; SI-NEXT: s_lshl_b32 s6, s11, s6 -; SI-NEXT: s_lshl_b32 s5, s10, s5 -; SI-NEXT: s_lshl_b32 s4, s9, s4 -; SI-NEXT: s_lshl_b32 s2, s8, s2 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s14, s5 -; SI-NEXT: s_or_b32 s4, s13, s4 -; SI-NEXT: s_or_b32 s7, s12, s2 +; SI-NEXT: s_and_b32 s2, s15, 31 +; SI-NEXT: s_mov_b32 s4, s11 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_and_b32 s14, s14, 31 +; SI-NEXT: s_mov_b32 s11, s10 +; SI-NEXT: s_and_b32 s13, s13, 31 +; SI-NEXT: s_mov_b32 s6, s9 +; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_and_b32 s12, s12, 31 +; SI-NEXT: s_mov_b32 s9, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s2 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s14 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s13 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -267,27 +255,25 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_u32 s5, 32, s15 -; GFX8-NEXT: s_sub_u32 s4, 32, s14 -; GFX8-NEXT: s_lshr_b32 s6, s11, s15 -; GFX8-NEXT: s_lshl_b32 s5, s11, s5 -; GFX8-NEXT: s_sub_u32 s3, 32, s13 -; GFX8-NEXT: s_or_b32 s5, s6, s5 -; GFX8-NEXT: s_lshr_b32 s6, s10, s14 -; GFX8-NEXT: s_lshl_b32 s4, s10, s4 -; GFX8-NEXT: s_sub_u32 s2, 32, s12 -; GFX8-NEXT: s_or_b32 s4, s6, s4 -; GFX8-NEXT: s_lshr_b32 s6, s9, s13 -; GFX8-NEXT: s_lshl_b32 s3, s9, s3 -; GFX8-NEXT: s_or_b32 s3, s6, s3 -; GFX8-NEXT: s_lshr_b32 s6, s8, s12 -; GFX8-NEXT: s_lshl_b32 s2, s8, s2 -; GFX8-NEXT: s_or_b32 s2, s6, s2 +; GFX8-NEXT: s_and_b32 s4, s15, 31 +; GFX8-NEXT: s_mov_b32 s2, s11 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_and_b32 s3, s13, 31 +; GFX8-NEXT: s_mov_b32 s6, s9 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: s_and_b32 s5, s14, 31 +; GFX8-NEXT: s_mov_b32 s11, s10 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; GFX8-NEXT: s_and_b32 s3, s12, 31 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -299,26 +285,24 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_u32 s2, 32, s12 -; GFX10-NEXT: s_sub_u32 s3, 32, s13 -; GFX10-NEXT: s_sub_u32 s4, 32, s14 -; GFX10-NEXT: s_sub_u32 s5, 32, s15 -; GFX10-NEXT: s_lshr_b32 s6, s11, s15 -; GFX10-NEXT: s_lshr_b32 s7, s10, s14 -; GFX10-NEXT: s_lshr_b32 s13, s9, s13 -; GFX10-NEXT: s_lshr_b32 s12, s8, s12 -; GFX10-NEXT: s_lshl_b32 s5, s11, s5 -; GFX10-NEXT: s_lshl_b32 s4, s10, s4 -; GFX10-NEXT: s_lshl_b32 s3, s9, s3 -; GFX10-NEXT: s_lshl_b32 s2, s8, s2 -; GFX10-NEXT: s_or_b32 s5, s6, s5 -; GFX10-NEXT: s_or_b32 s4, s7, s4 -; GFX10-NEXT: s_or_b32 s2, s12, s2 -; GFX10-NEXT: s_or_b32 s3, s13, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_and_b32 s6, s15, 31 +; GFX10-NEXT: s_mov_b32 s2, s11 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_and_b32 s7, s14, 31 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_and_b32 s13, s13, 31 +; GFX10-NEXT: s_mov_b32 s4, s9 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_and_b32 s12, s12, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -328,26 +312,24 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_u32 s2, 32, s12 -; GFX11-NEXT: s_sub_u32 s3, 32, s13 -; GFX11-NEXT: s_sub_u32 s4, 32, s14 -; GFX11-NEXT: s_sub_u32 s5, 32, s15 -; GFX11-NEXT: s_lshr_b32 s6, s11, s15 -; GFX11-NEXT: s_lshr_b32 s7, s10, s14 -; GFX11-NEXT: s_lshr_b32 s13, s9, s13 -; GFX11-NEXT: s_lshr_b32 s12, s8, s12 -; GFX11-NEXT: s_lshl_b32 s5, s11, s5 -; GFX11-NEXT: s_lshl_b32 s4, s10, s4 -; GFX11-NEXT: s_lshl_b32 s3, s9, s3 -; GFX11-NEXT: s_lshl_b32 s2, s8, s2 -; GFX11-NEXT: s_or_b32 s5, s6, s5 -; GFX11-NEXT: s_or_b32 s4, s7, s4 -; GFX11-NEXT: s_or_b32 s2, s12, s2 -; GFX11-NEXT: s_or_b32 s3, s13, s3 +; GFX11-NEXT: s_and_b32 s6, s15, 31 +; GFX11-NEXT: s_mov_b32 s2, s11 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_and_b32 s7, s14, 31 +; GFX11-NEXT: s_mov_b32 s11, s10 +; GFX11-NEXT: s_and_b32 s13, s13, 31 +; GFX11-NEXT: s_mov_b32 s4, s9 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_and_b32 s12, s12, 31 +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s13 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -357,26 +339,24 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_co_u32 s2, 32, s12 -; GFX12-NEXT: s_sub_co_u32 s3, 32, s13 -; GFX12-NEXT: s_sub_co_u32 s4, 32, s14 -; GFX12-NEXT: s_sub_co_u32 s5, 32, s15 -; GFX12-NEXT: s_lshr_b32 s6, s11, s15 -; GFX12-NEXT: s_lshr_b32 s7, s10, s14 -; GFX12-NEXT: s_lshr_b32 s13, s9, s13 -; GFX12-NEXT: s_lshr_b32 s12, s8, s12 -; GFX12-NEXT: s_lshl_b32 s5, s11, s5 -; GFX12-NEXT: s_lshl_b32 s4, s10, s4 -; GFX12-NEXT: s_lshl_b32 s3, s9, s3 -; GFX12-NEXT: s_lshl_b32 s2, s8, s2 -; GFX12-NEXT: s_or_b32 s5, s6, s5 -; GFX12-NEXT: s_or_b32 s4, s7, s4 -; GFX12-NEXT: s_or_b32 s2, s12, s2 -; GFX12-NEXT: s_or_b32 s3, s13, s3 +; GFX12-NEXT: s_and_b32 s6, s15, 31 +; GFX12-NEXT: s_mov_b32 s2, s11 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_and_b32 s7, s14, 31 +; GFX12-NEXT: s_mov_b32 s11, s10 +; GFX12-NEXT: s_and_b32 s13, s13, 31 +; GFX12-NEXT: s_mov_b32 s4, s9 +; GFX12-NEXT: s_mov_b32 s5, s9 +; GFX12-NEXT: s_and_b32 s12, s12, 31 +; GFX12-NEXT: s_mov_b32 s9, s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], s13 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -418,23 +398,44 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s19 -; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s17 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 -; SI-NEXT: v_mov_b32_e32 v4, s23 -; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4 -; SI-NEXT: v_mov_b32_e32 v4, s22 -; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4 -; SI-NEXT: v_mov_b32_e32 v4, s21 -; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4 -; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: s_and_b32 s24, s19, 31 +; SI-NEXT: s_mov_b32 s4, s11 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_and_b32 s25, s18, 31 +; SI-NEXT: s_mov_b32 s11, s10 +; SI-NEXT: s_and_b32 s26, s17, 31 +; SI-NEXT: s_mov_b32 s6, s9 +; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_and_b32 s27, s16, 31 +; SI-NEXT: s_mov_b32 s9, s8 +; SI-NEXT: s_and_b32 s23, s23, 31 +; SI-NEXT: s_mov_b32 s16, s15 +; SI-NEXT: s_mov_b32 s17, s15 +; SI-NEXT: s_and_b32 s22, s22, 31 +; SI-NEXT: s_mov_b32 s15, s14 +; SI-NEXT: s_and_b32 s21, s21, 31 +; SI-NEXT: s_mov_b32 s18, s13 +; SI-NEXT: s_mov_b32 s19, s13 +; SI-NEXT: s_and_b32 s20, s20, 31 +; SI-NEXT: s_mov_b32 s13, s12 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s24 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s25 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s26 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s23 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 +; SI-NEXT: s_lshr_b64 s[18:19], s[18:19], s21 +; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], s20 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s27 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -443,28 +444,48 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s23 -; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s21 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s19 -; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: s_and_b32 s4, s19, 31 +; GFX8-NEXT: s_mov_b32 s2, s11 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_and_b32 s3, s17, 31 +; GFX8-NEXT: s_mov_b32 s6, s9 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: s_and_b32 s5, s18, 31 +; GFX8-NEXT: s_mov_b32 s11, s10 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; GFX8-NEXT: s_and_b32 s3, s16, 31 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 +; GFX8-NEXT: s_and_b32 s3, s23, 31 +; GFX8-NEXT: s_mov_b32 s10, s15 +; GFX8-NEXT: s_mov_b32 s11, s15 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s3 +; GFX8-NEXT: s_and_b32 s3, s22, 31 +; GFX8-NEXT: s_mov_b32 s15, s14 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s3 +; GFX8-NEXT: s_and_b32 s3, s21, 31 +; GFX8-NEXT: s_mov_b32 s16, s13 +; GFX8-NEXT: s_mov_b32 s17, s13 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[16:17], s3 +; GFX8-NEXT: s_and_b32 s3, s20, 31 +; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 16 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -476,16 +497,44 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23 -; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22 -; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21 -; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16 -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: s_and_b32 s19, s19, 31 +; GFX10-NEXT: s_mov_b32 s2, s11 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_and_b32 s17, s17, 31 +; GFX10-NEXT: s_mov_b32 s4, s9 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_and_b32 s16, s16, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 +; GFX10-NEXT: s_and_b32 s23, s23, 31 +; GFX10-NEXT: s_mov_b32 s6, s15 +; GFX10-NEXT: s_mov_b32 s7, s15 +; GFX10-NEXT: s_and_b32 s22, s22, 31 +; GFX10-NEXT: s_mov_b32 s15, s14 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 +; GFX10-NEXT: s_mov_b32 s16, s13 +; GFX10-NEXT: s_mov_b32 s17, s13 +; GFX10-NEXT: s_and_b32 s3, s20, 31 +; GFX10-NEXT: s_mov_b32 s13, s12 +; GFX10-NEXT: s_and_b32 s5, s21, 31 +; GFX10-NEXT: s_and_b32 s18, s18, 31 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 +; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_mov_b32_e32 v6, s10 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v8i32: @@ -493,20 +542,88 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23 -; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22 -; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21 -; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20 -; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16 +; GFX11-NEXT: s_and_b32 s19, s19, 31 +; GFX11-NEXT: s_mov_b32 s2, s11 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_and_b32 s17, s17, 31 +; GFX11-NEXT: s_mov_b32 s4, s9 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_and_b32 s16, s16, 31 +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 +; GFX11-NEXT: s_and_b32 s23, s23, 31 +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s15 +; GFX11-NEXT: s_and_b32 s22, s22, 31 +; GFX11-NEXT: s_mov_b32 s15, s14 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 +; GFX11-NEXT: s_mov_b32 s16, s13 +; GFX11-NEXT: s_mov_b32 s17, s13 +; GFX11-NEXT: s_and_b32 s3, s20, 31 +; GFX11-NEXT: s_mov_b32 s13, s12 +; GFX11-NEXT: s_and_b32 s5, s21, 31 +; GFX11-NEXT: s_and_b32 s18, s18, 31 +; GFX11-NEXT: s_mov_b32 s11, s10 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 +; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s16 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_mov_b32_e32 v6, s10 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rotr_v8i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s19, s19, 31 +; GFX12-NEXT: s_mov_b32 s2, s11 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_and_b32 s17, s17, 31 +; GFX12-NEXT: s_mov_b32 s4, s9 +; GFX12-NEXT: s_mov_b32 s5, s9 +; GFX12-NEXT: s_and_b32 s16, s16, 31 +; GFX12-NEXT: s_mov_b32 s9, s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 +; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 +; GFX12-NEXT: s_and_b32 s23, s23, 31 +; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_mov_b32 s7, s15 +; GFX12-NEXT: s_and_b32 s22, s22, 31 +; GFX12-NEXT: s_mov_b32 s15, s14 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 +; GFX12-NEXT: s_mov_b32 s16, s13 +; GFX12-NEXT: s_mov_b32 s17, s13 +; GFX12-NEXT: s_and_b32 s3, s20, 31 +; GFX12-NEXT: s_mov_b32 s13, s12 +; GFX12-NEXT: s_and_b32 s5, s21, 31 +; GFX12-NEXT: s_and_b32 s18, s18, 31 +; GFX12-NEXT: s_mov_b32 s11, s10 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 +; GFX12-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 +; GFX12-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 +; GFX12-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 +; GFX12-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s16 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s6 +; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s4 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s2 +; GFX12-NEXT: v_mov_b32_e32 v6, s10 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = sub <8 x i32> , %y %tmp1 = shl <8 x i32> %x, %tmp0