diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 971dfdbe3e70a..1f7d779f768a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -504,9 +504,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // The hardware supports 32-bit FSHR, but not FSHL. setOperationAction(ISD::FSHR, MVT::i32, Legal); - // The hardware supports 32-bit ROTR, but not ROTL. - setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); + setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand); setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index bd443b5b6f1e6..ddcb431f39a87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -806,12 +806,6 @@ class DwordAddrPat : AMDGPUPat < (vt rc:$addr) >; -// rotr pattern -class ROTRPattern : AMDGPUPat < - (rotr i32:$src0, i32:$src1), - (BIT_ALIGN $src0, $src0, $src1) ->; - // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index dadc7dcd7054a..a2e3ecef1c206 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -505,7 +505,6 @@ def : AMDGPUPat < (fshr i32:$src0, i32:$src1, i32:$src2), (BIT_ALIGN_INT_eg $src0, $src1, $src2) >; -def : ROTRPattern ; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; def FMA_eg : FMA_Common<0x7>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 301f2fc8dab45..4746bc3bf6a18 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14032,6 +14032,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(OtherOp.getValueSizeInBits() == 32); } + // Check that we haven't just recreated the same FSHR node. + if (N->getOpcode() == ISD::FSHR && + (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) && + (N->getOperand(1) == Op || N->getOperand(1) == OtherOp)) + return SDValue(); + if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { assert(Op.getValueType().isByteSized() && diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index ca5a4d7301bda..0846dd66bfcbf 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2685,8 +2685,6 @@ def : AMDGPUPat < let True16Predicate = NotHasTrue16BitInsts in { let SubtargetPredicate = isNotGFX9Plus in { -def : ROTRPattern ; - def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; @@ -2697,14 +2695,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (i32 ShiftAmt32Imm: } // isNotGFX9Plus let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - foreach pat = [(i32 (DivergentUnaryFrag (srl i64:$src0, (and i32:$src1, (i32 31))))), (i32 (DivergentUnaryFrag (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in def : GCNPat; - def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), @@ -2753,14 +2734,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = UseRealTrue16Insts let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll index b9bf13886d366..9d620d671dd8a 100644 --- a/llvm/test/CodeGen/AMDGPU/packetizer.ll +++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll @@ -5,43 +5,37 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { ; R600-LABEL: test: ; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: ADD_INT T0.Y, KC0[3].X, 1, -; R600-NEXT: ADD_INT T0.Z, KC0[3].Y, 1, -; R600-NEXT: ADD_INT T0.W, KC0[2].Z, 1, -; R600-NEXT: ADD_INT * T1.W, KC0[2].W, 1, -; R600-NEXT: BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z, -; R600-NEXT: BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z, -; R600-NEXT: BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z, -; R600-NEXT: BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z, -; R600-NEXT: OR_INT T0.W, PV.W, PV.Z, -; R600-NEXT: OR_INT * T1.W, PV.Y, PV.X, -; R600-NEXT: OR_INT T0.X, PS, PV.W, +; R600-NEXT: ADD_INT T0.Y, KC0[2].W, 1, +; R600-NEXT: ADD_INT T0.Z, KC0[2].Z, 1, +; R600-NEXT: ADD_INT T0.W, KC0[3].Y, 1, +; R600-NEXT: ADD_INT * T1.W, KC0[3].X, 1, +; R600-NEXT: OR_INT T0.W, PS, PV.W, +; R600-NEXT: OR_INT * T1.W, PV.Z, PV.Y, +; R600-NEXT: OR_INT * T0.W, PS, PV.W, +; R600-NEXT: BIT_ALIGN_INT T0.X, PV.W, PV.W, KC0[3].Z, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: test: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: -; CM-NEXT: ADD_INT T0.X, KC0[3].X, 1, -; CM-NEXT: ADD_INT T0.Y, KC0[3].Y, 1, -; CM-NEXT: ADD_INT T0.Z, KC0[2].Z, 1, -; CM-NEXT: ADD_INT * T0.W, KC0[2].W, 1, -; CM-NEXT: BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z, -; CM-NEXT: BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z, -; CM-NEXT: BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z, -; CM-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z, +; CM-NEXT: ADD_INT T0.X, KC0[2].W, 1, +; CM-NEXT: ADD_INT T0.Y, KC0[2].Z, 1, +; CM-NEXT: ADD_INT T0.Z, KC0[3].Y, 1, +; CM-NEXT: ADD_INT * T0.W, KC0[3].X, 1, ; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, ; CM-NEXT: OR_INT * T0.W, PV.Y, PV.X, -; CM-NEXT: OR_INT * T0.X, PV.W, PV.Z, +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: BIT_ALIGN_INT * T0.X, PV.W, PV.W, KC0[3].Z, ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index 0d7e73c326cd8..c98bcd53bec1a 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -118,14 +118,13 @@ define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %a ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 +; GCN-NEXT: v_alignbit_b32 v2, v2, s2, 24 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 0a1d15bf945f9..a7fcb6439703a 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -25,12 +25,14 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 +; SI-NEXT: s_sub_i32 s4, 32, s3 +; SI-NEXT: s_mov_b32 s3, s2 +; SI-NEXT: s_and_b32 s4, s4, 31 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -38,11 +40,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 +; GFX8-NEXT: s_sub_i32 s4, 32, s3 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s4, s4, 31 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -52,18 +56,24 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: s_and_b32 s4, s3, 31 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s4, s3, 31 +; GFX11-NEXT: s_mov_b32 s3, s2 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -97,14 +107,19 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_sub_i32 s2, 32, s2 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_sub_i32 s6, 32, s2 +; SI-NEXT: s_sub_i32 s8, 32, s3 +; SI-NEXT: s_mov_b32 s2, s1 +; SI-NEXT: s_mov_b32 s3, s1 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: s_and_b32 s8, s8, 31 +; SI-NEXT: s_and_b32 s6, s6, 31 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -113,13 +128,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 32, s2 -; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: s_sub_i32 s7, 32, s3 +; GFX8-NEXT: s_sub_i32 s6, 32, s2 +; GFX8-NEXT: s_mov_b32 s2, s1 +; GFX8-NEXT: s_mov_b32 s3, s1 +; GFX8-NEXT: s_and_b32 s1, s7, 31 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s1 +; GFX8-NEXT: s_and_b32 s3, s6, 31 +; GFX8-NEXT: s_mov_b32 s1, s0 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -131,10 +151,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: s_sub_i32 s2, 32, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: s_sub_i32 s4, 32, s2 +; GFX10-NEXT: s_sub_i32 s5, 32, s3 +; GFX10-NEXT: s_mov_b32 s2, s1 +; GFX10-NEXT: s_mov_b32 s3, s1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_and_b32 s4, s4, 31 +; GFX10-NEXT: s_and_b32 s5, s5, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -143,12 +170,19 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s3, 32, s3 -; GFX11-NEXT: s_sub_i32 s2, 32, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: s_sub_i32 s6, 32, s2 +; GFX11-NEXT: s_sub_i32 s7, 32, s3 +; GFX11-NEXT: s_mov_b32 s2, s1 +; GFX11-NEXT: s_mov_b32 s3, s1 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_and_b32 s6, s6, 31 +; GFX11-NEXT: s_and_b32 s7, s7, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -188,20 +222,30 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s4, 32, s12 -; SI-NEXT: s_sub_i32 s5, 32, s13 -; SI-NEXT: s_sub_i32 s6, 32, s15 -; SI-NEXT: s_sub_i32 s7, 32, s14 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: s_sub_i32 s2, 32, s12 +; SI-NEXT: s_sub_i32 s12, 32, s13 +; SI-NEXT: s_sub_i32 s13, 32, s14 +; SI-NEXT: s_sub_i32 s14, 32, s15 +; SI-NEXT: s_mov_b32 s4, s11 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s11, s10 +; SI-NEXT: s_mov_b32 s6, s9 +; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_mov_b32 s9, s8 +; SI-NEXT: s_and_b32 s14, s14, 31 +; SI-NEXT: s_and_b32 s13, s13, 31 +; SI-NEXT: s_and_b32 s12, s12, 31 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s13 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s12 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -210,19 +254,29 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s5, 32, s15 +; GFX8-NEXT: s_sub_i32 s2, 32, s15 +; GFX8-NEXT: s_and_b32 s5, s2, 31 +; GFX8-NEXT: s_mov_b32 s2, s11 +; GFX8-NEXT: s_mov_b32 s3, s11 ; GFX8-NEXT: s_sub_i32 s4, 32, s14 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s3, 32, s13 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_sub_i32 s2, 32, s12 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX8-NEXT: s_sub_i32 s6, 32, s13 +; GFX8-NEXT: s_and_b32 s3, s4, 31 +; GFX8-NEXT: s_mov_b32 s11, s10 +; GFX8-NEXT: s_sub_i32 s12, 32, s12 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s3 +; GFX8-NEXT: s_and_b32 s3, s6, 31 +; GFX8-NEXT: s_mov_b32 s6, s9 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; GFX8-NEXT: s_and_b32 s3, s12, 31 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -234,14 +288,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s12 -; GFX10-NEXT: s_sub_i32 s3, 32, s13 -; GFX10-NEXT: s_sub_i32 s4, 32, s15 -; GFX10-NEXT: s_sub_i32 s5, 32, s14 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s4 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s5 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s2 +; GFX10-NEXT: s_sub_i32 s6, 32, s12 +; GFX10-NEXT: s_sub_i32 s7, 32, s13 +; GFX10-NEXT: s_sub_i32 s12, 32, s14 +; GFX10-NEXT: s_sub_i32 s13, 32, s15 +; GFX10-NEXT: s_mov_b32 s2, s11 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_mov_b32 s4, s9 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_and_b32 s13, s13, 31 +; GFX10-NEXT: s_and_b32 s12, s12, 31 +; GFX10-NEXT: s_and_b32 s14, s7, 31 +; GFX10-NEXT: s_and_b32 s15, s6, 31 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[10:11], s12 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s15 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -250,16 +318,29 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s2, 32, s12 -; GFX11-NEXT: s_sub_i32 s3, 32, s13 -; GFX11-NEXT: s_sub_i32 s4, 32, s15 -; GFX11-NEXT: s_sub_i32 s5, 32, s14 -; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s4 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s5 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s2 +; GFX11-NEXT: s_sub_i32 s6, 32, s12 +; GFX11-NEXT: s_sub_i32 s7, 32, s13 +; GFX11-NEXT: s_sub_i32 s12, 32, s14 +; GFX11-NEXT: s_sub_i32 s13, 32, s15 +; GFX11-NEXT: s_mov_b32 s2, s11 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s11, s10 +; GFX11-NEXT: s_mov_b32 s4, s9 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_and_b32 s13, s13, 31 +; GFX11-NEXT: s_and_b32 s12, s12, 31 +; GFX11-NEXT: s_and_b32 s14, s7, 31 +; GFX11-NEXT: s_and_b32 s15, s6, 31 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s12 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s15 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 403a556688091..71c7797cbc68e 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -22,12 +22,14 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s3, 31 +; SI-NEXT: s_mov_b32 s3, s2 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -35,10 +37,12 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 +; GFX8-NEXT: s_and_b32 s4, s3, 31 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -47,16 +51,22 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: s_and_b32 s4, s3, 31 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX11-NEXT: s_and_b32 s4, s3, 31 +; GFX11-NEXT: s_mov_b32 s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -86,12 +96,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_and_b32 s3, s3, 31 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_and_b32 s6, s2, 31 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], s3 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -100,11 +115,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: s_and_b32 s3, s3, 31 +; GFX8-NEXT: s_mov_b32 s6, s1 +; GFX8-NEXT: s_mov_b32 s7, s1 +; GFX8-NEXT: s_and_b32 s8, s2, 31 +; GFX8-NEXT: s_mov_b32 s1, s0 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], s3 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -116,8 +136,15 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_and_b32 s3, s3, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -126,10 +153,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_and_b32 s3, s3, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -161,16 +195,26 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: s_and_b32 s2, s15, 31 +; SI-NEXT: s_mov_b32 s4, s11 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_and_b32 s14, s14, 31 +; SI-NEXT: s_mov_b32 s11, s10 +; SI-NEXT: s_and_b32 s13, s13, 31 +; SI-NEXT: s_mov_b32 s6, s9 +; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_and_b32 s12, s12, 31 +; SI-NEXT: s_mov_b32 s9, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s2 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s14 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s13 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -179,15 +223,25 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s15 -; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: s_and_b32 s4, s15, 31 +; GFX8-NEXT: s_mov_b32 s2, s11 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_and_b32 s3, s13, 31 +; GFX8-NEXT: s_mov_b32 s6, s9 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: s_and_b32 s5, s14, 31 +; GFX8-NEXT: s_mov_b32 s11, s10 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; GFX8-NEXT: s_and_b32 s3, s12, 31 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -199,10 +253,24 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s15 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s14 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s13 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s12 +; GFX10-NEXT: s_and_b32 s6, s15, 31 +; GFX10-NEXT: s_mov_b32 s2, s11 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_and_b32 s7, s14, 31 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_and_b32 s13, s13, 31 +; GFX10-NEXT: s_mov_b32 s4, s9 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_and_b32 s12, s12, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -211,12 +279,25 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s15 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s14 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s13 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s12 +; GFX11-NEXT: s_and_b32 s6, s15, 31 +; GFX11-NEXT: s_mov_b32 s2, s11 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_and_b32 s7, s14, 31 +; GFX11-NEXT: s_mov_b32 s11, s10 +; GFX11-NEXT: s_and_b32 s13, s13, 31 +; GFX11-NEXT: s_mov_b32 s4, s9 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_and_b32 s12, s12, 31 +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -258,23 +339,44 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s19 -; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s17 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 -; SI-NEXT: v_mov_b32_e32 v4, s23 -; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4 -; SI-NEXT: v_mov_b32_e32 v4, s22 -; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4 -; SI-NEXT: v_mov_b32_e32 v4, s21 -; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4 -; SI-NEXT: v_mov_b32_e32 v4, s20 -; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: s_and_b32 s24, s19, 31 +; SI-NEXT: s_mov_b32 s4, s11 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_and_b32 s25, s18, 31 +; SI-NEXT: s_mov_b32 s11, s10 +; SI-NEXT: s_and_b32 s26, s17, 31 +; SI-NEXT: s_mov_b32 s6, s9 +; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_and_b32 s27, s16, 31 +; SI-NEXT: s_mov_b32 s9, s8 +; SI-NEXT: s_and_b32 s23, s23, 31 +; SI-NEXT: s_mov_b32 s16, s15 +; SI-NEXT: s_mov_b32 s17, s15 +; SI-NEXT: s_and_b32 s22, s22, 31 +; SI-NEXT: s_mov_b32 s15, s14 +; SI-NEXT: s_and_b32 s21, s21, 31 +; SI-NEXT: s_mov_b32 s18, s13 +; SI-NEXT: s_mov_b32 s19, s13 +; SI-NEXT: s_and_b32 s20, s20, 31 +; SI-NEXT: s_mov_b32 s13, s12 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s24 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s25 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s26 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s23 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 +; SI-NEXT: s_lshr_b64 s[18:19], s[18:19], s21 +; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], s20 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s27 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -283,28 +385,48 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s23 -; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s21 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s19 -; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: s_and_b32 s4, s19, 31 +; GFX8-NEXT: s_mov_b32 s2, s11 +; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_and_b32 s3, s17, 31 +; GFX8-NEXT: s_mov_b32 s6, s9 +; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: s_and_b32 s5, s18, 31 +; GFX8-NEXT: s_mov_b32 s11, s10 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; GFX8-NEXT: s_and_b32 s3, s16, 31 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 +; GFX8-NEXT: s_and_b32 s3, s23, 31 +; GFX8-NEXT: s_mov_b32 s10, s15 +; GFX8-NEXT: s_mov_b32 s11, s15 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s3 +; GFX8-NEXT: s_and_b32 s3, s22, 31 +; GFX8-NEXT: s_mov_b32 s15, s14 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s3 +; GFX8-NEXT: s_and_b32 s3, s21, 31 +; GFX8-NEXT: s_mov_b32 s16, s13 +; GFX8-NEXT: s_mov_b32 s17, s13 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[16:17], s3 +; GFX8-NEXT: s_and_b32 s3, s20, 31 +; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 16 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -316,16 +438,44 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23 -; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22 -; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21 -; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16 -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: s_and_b32 s19, s19, 31 +; GFX10-NEXT: s_mov_b32 s2, s11 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_and_b32 s17, s17, 31 +; GFX10-NEXT: s_mov_b32 s4, s9 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_and_b32 s16, s16, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 +; GFX10-NEXT: s_and_b32 s23, s23, 31 +; GFX10-NEXT: s_mov_b32 s6, s15 +; GFX10-NEXT: s_mov_b32 s7, s15 +; GFX10-NEXT: s_and_b32 s22, s22, 31 +; GFX10-NEXT: s_mov_b32 s15, s14 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 +; GFX10-NEXT: s_mov_b32 s16, s13 +; GFX10-NEXT: s_mov_b32 s17, s13 +; GFX10-NEXT: s_and_b32 s3, s20, 31 +; GFX10-NEXT: s_mov_b32 s13, s12 +; GFX10-NEXT: s_and_b32 s5, s21, 31 +; GFX10-NEXT: s_and_b32 s18, s18, 31 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 +; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_mov_b32_e32 v6, s10 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v8i32: @@ -333,19 +483,43 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23 -; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22 -; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21 -; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20 -; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16 +; GFX11-NEXT: s_and_b32 s19, s19, 31 +; GFX11-NEXT: s_mov_b32 s2, s11 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_and_b32 s17, s17, 31 +; GFX11-NEXT: s_mov_b32 s4, s9 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_and_b32 s16, s16, 31 +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 +; GFX11-NEXT: s_and_b32 s23, s23, 31 +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s15 +; GFX11-NEXT: s_and_b32 s22, s22, 31 +; GFX11-NEXT: s_mov_b32 s15, s14 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 +; GFX11-NEXT: s_mov_b32 s16, s13 +; GFX11-NEXT: s_mov_b32 s17, s13 +; GFX11-NEXT: s_and_b32 s3, s20, 31 +; GFX11-NEXT: s_mov_b32 s13, s12 +; GFX11-NEXT: s_and_b32 s5, s21, 31 +; GFX11-NEXT: s_and_b32 s18, s18, 31 +; GFX11-NEXT: s_mov_b32 s11, s10 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 +; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s16 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_mov_b32_e32 v6, s10 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX11-NEXT: s_endpgm entry: %tmp0 = sub <8 x i32> , %y diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 28330bfc9bb69..acf999e586a68 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -1470,21 +1470,20 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; ; EG-LABEL: s_shl_inline_imm_1_i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: ; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x, -; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 31(4.344025e-44), 26(3.643376e-44) -; EG-NEXT: ASHR T1.W, PS, literal.x, -; EG-NEXT: LSHL * T0.W, 1, PV.W, +; EG-NEXT: NOT_INT * T1.W, KC0[2].W, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.Y, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, +; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, 0.0, PS, +; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x, +; EG-NEXT: LSHL * T0.W, 1, PV.W, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, 0.0, +; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS, +; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %shl = shl i64 1, %a