diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5bb88e4a57dc3..eba54e30da60f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1182,6 +1182,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
     SLPStore,
     ActiveLaneMask,
     ExplicitVectorLength,
+    ExplicitVectorLengthMask,
     CalculateTripCountMinusVF,
     // Increment the canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a482455473e4..8e394e3cc8bb4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -137,6 +137,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPInstruction::Not:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
+  case VPInstruction::ExplicitVectorLengthMask:
   case VPInstruction::ExtractFromEnd:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
@@ -426,6 +427,14 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
+  case VPInstruction::ExplicitVectorLengthMask: {
+    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
+    // Compute step < splat(evl)
+    Value *EVL = State.get(getOperand(0), VPIteration(0, 0));
+    Value *SplatEVL = Builder.CreateVectorSplat(State.VF, EVL);
+    Value *Step = Builder.CreateStepVector(SplatEVL->getType());
+    return Builder.CreateICmpULT(Step, SplatEVL, "evl.mask");
+  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
@@ -672,6 +681,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
     return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::ExplicitVectorLengthMask:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
@@ -730,6 +740,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
+  case VPInstruction::ExplicitVectorLengthMask:
+    O << "EXPLICIT-VECTOR-LENGTH-MASK";
+    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8ec67eb2f54bd..d8c34013f8b3f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1426,6 +1426,23 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
                         {EVLPhi, Plan.getTripCount()});
   VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
 
+  // Replace the header mask pattern (ICmp::ule widen-canonical-IV, BTC) with
+  // (ICmp::ult step-vector, EVL).
+  // TODO: Replace all users of the ExplicitVectorLengthMask recipe with
+  // EVL-based recipes wherever possible to ensure the final VPlan does not use
+  // the mask. The ExplicitVectorLengthMask recipe is a temporary approach to
+  // handle situations requiring a header mask, such as out-loop (unordered)
+  // reductions.
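+  // For example, with VF = 4 and EVL = 3 the mask (icmp ult step-vector,
+  // splat(EVL)) evaluates to <i1 1, 1, 1, 0>, keeping exactly the first EVL
+  // lanes active.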
+  // It is necessary to generate a mask different from the original header
+  // mask because the explicit vector length of the second-to-last iteration
+  // may be smaller than VF * UF.
+  auto *EVLMask =
+      new VPInstruction(VPInstruction::ExplicitVectorLengthMask, {VPEVL});
+  EVLMask->insertAfter(VPEVL);
+  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
+    HeaderMask->replaceAllUsesWith(EVLMask);
+    recursivelyDeleteDeadRecipes(HeaderMask);
+  }
+
   auto *CanonicalIVIncrement =
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   VPSingleDefRecipe *OpVPEVL = VPEVL;
@@ -1444,29 +1461,28 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
   NextEVLIV->insertBefore(CanonicalIVIncrement);
   EVLPhi->addOperand(NextEVLIV);
 
-  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
-    for (VPUser *U : collectUsersRecursively(HeaderMask)) {
-      auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
-      if (!MemR)
-        continue;
-      VPValue *OrigMask = MemR->getMask();
-      assert(OrigMask && "Unmasked widen memory recipe when folding tail");
-      VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
-      if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
-        auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
-        N->insertBefore(L);
-        L->replaceAllUsesWith(N);
-        L->eraseFromParent();
-      } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
-        auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
-        N->insertBefore(S);
-        S->eraseFromParent();
-      } else {
-        llvm_unreachable("unsupported recipe");
-      }
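+  // Rewrite widened memory recipes guarded by the EVL mask into their
+  // VP-intrinsic variants. When the EVL mask was a recipe's only mask, the
+  // EVL operand already covers it, so the new recipe is left unmasked.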
+  for (VPUser *U : collectUsersRecursively(EVLMask)) {
+    auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
+    if (!MemR)
+      continue;
+    VPValue *OrigMask = MemR->getMask();
+    assert(OrigMask && "Unmasked widen memory recipe when folding tail");
+    VPValue *NewMask = EVLMask == OrigMask ? nullptr : OrigMask;
+    if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
+      auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+      N->insertBefore(L);
+      L->replaceAllUsesWith(N);
+      L->eraseFromParent();
+    } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
+      auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+      N->insertBefore(S);
+      S->eraseFromParent();
+    } else {
+      llvm_unreachable("unsupported recipe");
     }
-    recursivelyDeleteDeadRecipes(HeaderMask);
   }
+  recursivelyDeleteDeadRecipes(EVLMask);
+
   // Replace all uses of VPCanonicalIVPHIRecipe by
   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
   CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
index 9d02ce715139e..3aecfc0d613e6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -20,43 +20,38 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
-; IF-EVL-NEXT:    [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
-; IF-EVL-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
-; IF-EVL-NEXT:    [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
-; IF-EVL-NEXT:    [[TMP23:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
-; IF-EVL-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP12]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i64 0
+; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-NEXT:    [[EVL_MASK:%.*]] = icmp ult <vscale x 4 x i32> [[TMP11]], [[DOTSPLAT]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP12]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
+; IF-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[EVL_MASK]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP12]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP19:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD1]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP19]], ptr align 4 [[TMP18]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
@@ -65,13 +60,13 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; IF-EVL:       for.body:
 ; IF-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; IF-EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[TMP22]], 0
 ; IF-EVL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; IF-EVL:       if.then:
 ; IF-EVL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
-; IF-EVL-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
-; IF-EVL-NEXT:    [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add i32 [[TMP22]], [[TMP23]]
 ; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
 ; IF-EVL-NEXT:    br label [[FOR_INC]]
 ; IF-EVL:       for.inc:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index d62f70c06a5fb..31bca31efe1e3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -16,46 +16,46 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; IF-EVL-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
 ; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
-; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true)
 ; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], -1
-; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP13]]
-; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP13]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
-; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 0, [[TMP20]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = sub i64 1, [[TMP20]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
-; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP24]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP8]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], -1
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; IF-EVL-NEXT:    [[TMP12:%.*]] = mul i64 0, [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = sub i64 1, [[TMP11]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP12]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; IF-EVL-NEXT:    [[TMP19:%.*]] = mul i64 0, [[TMP18]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = sub i64 1, [[TMP18]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP19]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP20]]
+; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP22]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
@@ -119,61 +119,59 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; IF-EVL-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
 ; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
-; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i64 0
+; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-NEXT:    [[EVL_MASK:%.*]] = icmp ult <vscale x 4 x i32> [[TMP7]], [[DOTSPLAT]]
 ; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; IF-EVL-NEXT:    [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX3]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP12]]
-; IF-EVL-NEXT:    [[TMP13:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-NEXT:    [[TMP14:%.*]] = add i64 [[TMP9]], -1
-; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP10]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP21]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP21]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP8]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
-; IF-EVL-NEXT:    [[TMP29:%.*]] = mul i64 0, [[TMP28]]
-; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 1, [[TMP28]]
-; IF-EVL-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP29]]
-; IF-EVL-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i64 [[TMP30]]
-; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP32]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP8]])
-; IF-EVL-NEXT:    [[TMP33:%.*]] = zext i32 [[TMP8]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP33]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; IF-EVL-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX3]], 0
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP8]], -1
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[TMP14:%.*]] = select <vscale x 4 x i1> [[EVL_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; IF-EVL-NEXT:    [[TMP18:%.*]] = mul i64 0, [[TMP17]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i64 1, [[TMP17]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP18]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[TMP20]], i64 [[TMP19]]
+; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 4
+; IF-EVL-NEXT:    [[TMP25:%.*]] = mul i64 0, [[TMP24]]
+; IF-EVL-NEXT:    [[TMP26:%.*]] = sub i64 1, [[TMP24]]
+; IF-EVL-NEXT:    [[TMP27:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP25]]
+; IF-EVL-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP27]], i64 [[TMP26]]
+; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP28]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP29]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph: