diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2fd4479ca5fe0b..1a54c95b75a6ea 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4949,56 +4949,60 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   MVT IndexContainerVT =
       ContainerVT.changeVectorElementType(IndexVT.getScalarType());
 
-  SDValue Gather;
-  // TODO: This doesn't trigger for i64 vectors on RV32, since there we
-  // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
-  if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
-    Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG,
-                              Subtarget);
-  } else {
+  // Base case for the recursion just below - handle the worst case
+  // single source permutation. Note that all the splat variants
+  // are handled above.
+  if (V2.isUndef()) {
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    // If only one index is used, we can use a "splat" vrgather.
-    // TODO: We can splat the most-common index and fix-up any stragglers, if
-    // that's beneficial.
-    if (LHSIndexCounts.size() == 1) {
-      int SplatIndex = LHSIndexCounts.begin()->getFirst();
-      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
-                           DAG.getConstant(SplatIndex, DL, XLenVT),
-                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
-    } else {
-      SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-      LHSIndices =
-          convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
-
-      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
-                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+    LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
+                                         Subtarget);
+    SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                                 DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+  }
+
+  // Translate the gather index we computed above (and possibly swapped)
+  // back to a shuffle mask. This step should disappear once we complete
+  // the migration to recursive design.
+  SmallVector<int> ShuffleMaskLHS;
+  ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
+  for (SDValue GatherIndex : GatherIndicesLHS) {
+    if (GatherIndex.isUndef()) {
+      ShuffleMaskLHS.push_back(-1);
+      continue;
     }
+    auto *IdxC = cast<ConstantSDNode>(GatherIndex);
+    ShuffleMaskLHS.push_back(IdxC->getZExtValue());
   }
 
-  // If a second vector operand is used by this shuffle, blend it in with an
-  // additional vrgather.
-  if (!V2.isUndef()) {
-    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+  // Recursively invoke lowering for the LHS as if there were no RHS.
+  // This allows us to leverage all of our single source permute tricks.
+  SDValue Gather =
+      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
+  Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
 
-    MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
-    SelectMask =
-        convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+  // Blend in second vector source with an additional vrgather.
+  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
 
-    // If only one index is used, we can use a "splat" vrgather.
-    // TODO: We can splat the most-common index and fix-up any stragglers, if
-    // that's beneficial.
-    if (RHSIndexCounts.size() == 1) {
-      int SplatIndex = RHSIndexCounts.begin()->getFirst();
-      Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
-                           DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
-                           SelectMask, VL);
-    } else {
-      SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-      RHSIndices =
-          convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
-                           SelectMask, VL);
-    }
+  MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+  SelectMask =
+      convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+
+  // If only one index is used, we can use a "splat" vrgather.
+  // TODO: We can splat the most-common index and fix-up any stragglers, if
+  // that's beneficial.
+  if (RHSIndexCounts.size() == 1) {
+    int SplatIndex = RHSIndexCounts.begin()->getFirst();
+    Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+                         DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
+                         SelectMask, VL);
+  } else {
+    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+    RHSIndices =
+        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+    Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
+                         SelectMask, VL);
   }
 
   return convertFromScalableVector(VT, Gather, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 799aebcaa63026..dab530751ef96b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -238,39 +238,26 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
 define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-LABEL: interleave_v32f32:
 ; V128:       # %bb.0:
-; V128-NEXT:    addi sp, sp, -16
-; V128-NEXT:    .cfi_def_cfa_offset 16
-; V128-NEXT:    csrr a0, vlenb
-; V128-NEXT:    slli a0, a0, 2
-; V128-NEXT:    sub sp, sp, a0
-; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; V128-NEXT:    lui a0, %hi(.LCPI10_0)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI10_0)
-; V128-NEXT:    li a1, 32
-; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v4, (a0)
-; V128-NEXT:    lui a0, %hi(.LCPI10_1)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; V128-NEXT:    vle16.v v24, (a0)
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs4r.v v24, (a0) # Unknown-size Folded Spill
-; V128-NEXT:    lui a0, 699051
-; V128-NEXT:    addi a0, a0, -1366
-; V128-NEXT:    vmv.s.x v0, a0
-; V128-NEXT:    vrgatherei16.vv v24, v8, v4
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vl4r.v v12, (a0) # Unknown-size Folded Reload
+; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT:    vslidedown.vi v0, v8, 16
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    lui a1, %hi(.LCPI10_0)
+; V128-NEXT:    addi a1, a1, %lo(.LCPI10_0)
+; V128-NEXT:    li a2, 32
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; V128-NEXT:    vle16.v v12, (a1)
+; V128-NEXT:    lui a1, 699051
+; V128-NEXT:    addi a1, a1, -1366
+; V128-NEXT:    vmv.s.x v0, a1
 ; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v0, v8, v16
-; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
-;
V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 -; V128-NEXT: add sp, sp, a0 -; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32f32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index e1bd16649eede7..9e21cc9e3d624a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -188,24 +188,30 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V128-LABEL: interleave_v4i32_offset_1: ; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V128-NEXT: vwaddu.vv v10, v8, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v8 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; V128-NEXT: vid.v v10 -; V128-NEXT: vsrl.vi v11, v10, 1 -; V128-NEXT: vrgather.vv v10, v8, v11 +; V128-NEXT: vid.v v8 +; V128-NEXT: vsrl.vi v8, v8, 1 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vadd.vi v8, v11, 1 +; V128-NEXT: vadd.vi v8, v8, 1 ; V128-NEXT: vrgather.vv v10, v9, v8, v0.t ; V128-NEXT: vmv.v.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32_offset_1: ; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v10, v8, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v8 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu -; V512-NEXT: vid.v v10 -; V512-NEXT: vsrl.vi v11, v10, 1 -; V512-NEXT: vrgather.vv v10, v8, v11 +; V512-NEXT: vid.v v8 +; V512-NEXT: vsrl.vi v8, v8, 1 ; V512-NEXT: vmv.v.i v0, 10 -; V512-NEXT: vadd.vi v8, v11, 1 +; V512-NEXT: vadd.vi v8, v8, 1 ; V512-NEXT: vrgather.vv v10, v9, v8, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret @@ -397,39 +403,26 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-LABEL: interleave_v32i32: ; V128: # %bb.0: -; V128-NEXT: addi sp, sp, -16 -; V128-NEXT: .cfi_def_cfa_offset 16 -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 -; V128-NEXT: sub sp, sp, a0 -; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; V128-NEXT: lui a0, %hi(.LCPI17_0) -; V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; V128-NEXT: li a1, 32 -; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; V128-NEXT: vle16.v v4, (a0) -; V128-NEXT: lui a0, %hi(.LCPI17_1) -; V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; V128-NEXT: vle16.v v24, (a0) -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill -; V128-NEXT: lui a0, 699051 -; V128-NEXT: addi a0, a0, -1366 -; V128-NEXT: vmv.s.x v0, a0 -; V128-NEXT: vrgatherei16.vv v24, v8, v4 -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload +; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; V128-NEXT: vslidedown.vi v0, v8, 16 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vwaddu.vv v24, v0, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v24, a0, v8 +; V128-NEXT: lui a1, %hi(.LCPI17_0) +; V128-NEXT: addi a1, a1, %lo(.LCPI17_0) +; V128-NEXT: li a2, 32 +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; V128-NEXT: vle16.v v12, (a1) +; V128-NEXT: lui a1, 699051 +; V128-NEXT: addi a1, a1, -1366 +; V128-NEXT: vmv.s.x v0, a1 ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv 
v0, v8, v16 -; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v0, a0, v16 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 -; V128-NEXT: add sp, sp, a0 -; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a56a81f5f793bc..a26a87a1f3c139 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -612,13 +612,11 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 224 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -4 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -4 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -628,13 +626,11 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_start_into_end_non_contiguous: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 144 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -4 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -4 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -675,13 +671,11 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 195 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -692,14 +686,12 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 2 -; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -710,16 +702,13 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: unmergable: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v11, v10, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI46_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; 
CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v11 -; CHECK-NEXT: vrgather.vv v10, v9, v12, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index eeb8e517d01d2d..f889041647b235 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -8,23 +8,51 @@ ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { -; CHECK-LABEL: load_factor2_v3: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v9, v8, v8 -; CHECK-NEXT: vrgather.vv v8, v10, v9 -; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t -; CHECK-NEXT: vadd.vi v11, v9, 1 -; CHECK-NEXT: vrgather.vv v9, v10, v11 -; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: load_factor2_v3: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0) +; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v10, 2 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vwaddu.vv v8, v10, v9 +; RV32-NEXT: li a0, -1 +; RV32-NEXT: vwmaccu.vx v8, a0, v9 +; RV32-NEXT: vmv.v.i v0, 4 +; RV32-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v12, v10, 4 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vrgather.vi v8, v12, 0, v0.t +; RV32-NEXT: vid.v v9 +; RV32-NEXT: vadd.vv v9, v9, v9 +; RV32-NEXT: vadd.vi v11, v9, 1 +; RV32-NEXT: vrgather.vv v9, v10, v11 +; RV32-NEXT: vrgather.vi v9, v12, 1, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_v3: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0) +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vid.v v8 +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vadd.vi v8, v8, 1 +; RV64-NEXT: vrgather.vv v9, v10, v8 +; RV64-NEXT: vmv.v.i v0, 4 +; RV64-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v12, v10, 4 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vrgather.vi v9, v12, 1, v0.t +; RV64-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v11, v10, 2 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vwaddu.vv v8, v10, v11 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vwmaccu.vx v8, a0, v11 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vrgather.vi v8, v12, 0, v0.t +; RV64-NEXT: ret %interleaved.vec = load <6 x i32>, ptr %ptr %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> @@ -131,163 +159,142 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 62 +; 
RV32-NEXT: li a3, 58 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 62 * vlenb -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: addi a4, a1, 256 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb +; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a4) +; RV32-NEXT: vle32.v v8, (a3) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 25 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a1, 128 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vslideup.vi v16, v8, 4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 29 +; RV32-NEXT: li a5, 12 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vid.v v10 +; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vid.v v20 +; RV32-NEXT: vadd.vi v4, v20, -10 +; RV32-NEXT: vmv.v.v v2, v20 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 3 +; RV32-NEXT: slli a5, a4, 4 ; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v10, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vadd.vi v8, v10, -4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 13 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 21 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v10, -10 +; RV32-NEXT: vs2r.v v20, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: vmv.s.x v1, a4 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: slli a5, a4, 5 +; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v16, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 45 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV32-NEXT: vs1r.v v1, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v16, v8, v4, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 25 +; RV32-NEXT: li a5, 21 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size 
Folded Spill +; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: lui a5, %hi(.LCPI6_1) -; RV32-NEXT: addi a5, a5, %lo(.LCPI6_1) -; RV32-NEXT: lui a6, 1 ; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vle16.v v8, (a5) ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 2 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, %hi(.LCPI6_1) +; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1) +; RV32-NEXT: lui a5, 1 +; RV32-NEXT: vle16.v v8, (a4) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 37 +; RV32-NEXT: li a4, 49 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, a6, -64 +; RV32-NEXT: addi a1, a5, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v10, -2 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v12, v8, 2 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v10, -8 -; RV32-NEXT: vmv2r.v v30, v10 +; RV32-NEXT: vadd.vi v8, v2, -8 ; 
RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv1r.v v0, v28 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v24, v12 +; RV32-NEXT: vmv.v.v v20, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -301,166 +308,165 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v4, v16, v8 +; RV32-NEXT: vrgatherei16.vv v24, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v30, -6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v8, v8, -6 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v28 -; RV32-NEXT: vmv1r.v v2, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: 
add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v4, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) -; RV32-NEXT: vle16.v v20, (a1) -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v28, (a3) ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v1, a1 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs1r.v v20, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v8, v20 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vmv1r.v v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v4, v24 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v8, v8, -4 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded 
Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v4, v12 +; RV32-NEXT: vmv.v.v v24, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_9) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) ; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v20, (a3) +; RV32-NEXT: vle16.v v28, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v4, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) @@ -468,25 +474,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v20, v16, 6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v20, v16, v10 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -501,13 +502,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v28, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li 
a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -515,7 +516,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -528,19 +529,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -553,33 +554,33 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a2, a1, 5 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 37 +; RV32-NEXT: li a2, 49 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 29 +; RV32-NEXT: li a2, 25 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 53 +; RV32-NEXT: li a2, 41 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a2, a1, 5 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload @@ -593,37 +594,35 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a3, a2, 4 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 3 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 4 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, 
sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 +; RV32-NEXT: li a2, 21 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 62 +; RV32-NEXT: li a1, 58 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index d0777962a75651..a34fa9502d93b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -8,13 +8,11 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn1.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -24,13 +22,11 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn2.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -40,16 +36,14 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn1.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 -; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -59,16 +53,14 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn2.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 -; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 
@@ -78,12 +70,10 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn1.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -93,12 +83,10 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn2.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -108,13 +96,11 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn1.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -124,13 +110,11 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn2.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -163,12 +147,10 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn1.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -178,12 +160,10 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn2.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 
x i32> ret <4 x i32> %tmp0 @@ -239,12 +219,10 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn1.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -254,12 +232,10 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn2.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -292,12 +268,10 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn1.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -307,12 +281,10 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn2.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -322,13 +294,11 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn1.v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 @@ -338,13 +308,11 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn2.v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, 
v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0
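
Note on the new lowering strategy (editor's illustrative sketch, not part of the patch): a two-source shuffle is now split into a single-source shuffle of the first operand, which is lowered recursively, plus one masked vrgather that blends in the elements taken from the second operand. The hypothetical IR below (the function name, operands, and mask are invented for illustration) shows the kind of shuffle this path covers, with the conceptual split spelled out in comments.

; Even lanes come from %x, odd lanes from %y.
define <4 x i32> @two_source_shuffle_example(<4 x i32> %x, <4 x i32> %y) {
  %r = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %r
}
; Conceptually, the patched lowering treats this roughly as:
;   %lhs = shufflevector %x, poison, <0, undef, 2, undef>   ; single source, lowered recursively
;   %r   = vrgather of %y with indices <undef, 1, undef, 3>, merged into %lhs
;          under a select mask that is active in lanes 1 and 3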