diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 082489f70f1c6..9a6d42e3cf7d0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6899,46 +6899,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     }
   }
 
-  /// Compute the cost of all exiting conditions of the loop using the legacy
-  /// cost model. This is to match the legacy behavior, which adds the cost of
-  /// all exit conditions. Note that this over-estimates the cost, as there will
-  /// be a single condition to control the vector loop.
-  SmallVector<BasicBlock *> Exiting;
-  CM.TheLoop->getExitingBlocks(Exiting);
-  SetVector<Instruction *> ExitInstrs;
-  // Collect all exit conditions.
-  for (BasicBlock *EB : Exiting) {
-    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
-    if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
-      continue;
-    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
-      ExitInstrs.insert(CondI);
-    }
-  }
-  // Compute the cost of all instructions only feeding the exit conditions.
-  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
-    Instruction *CondI = ExitInstrs[I];
-    if (!OrigLoop->contains(CondI) ||
-        !CostCtx.SkipCostComputation.insert(CondI).second)
-      continue;
-    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
-    LLVM_DEBUG({
-      dbgs() << "Cost of " << CondICost << " for VF " << VF
-             << ": exit condition instruction " << *CondI << "\n";
-    });
-    Cost += CondICost;
-    for (Value *Op : CondI->operands()) {
-      auto *OpI = dyn_cast<Instruction>(Op);
-      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
-          any_of(OpI->users(), [&ExitInstrs, this](User *U) {
-            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
-                   !ExitInstrs.contains(cast<Instruction>(U));
-          }))
-        continue;
-      ExitInstrs.insert(OpI);
-    }
-  }
-
   // Pre-compute the costs for branches except for the backedge, as the number
   // of replicate regions in a VPlan may not directly match the number of
   // branches, which would lead to different decisions.
@@ -7058,8 +7018,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       }
       // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
       // cost model won't cost it whilst the legacy will.
+      using namespace VPlanPatternMatch;
       if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
-        using namespace VPlanPatternMatch;
         if (none_of(FOR->users(),
                     match_fn(m_VPInstruction<
                         VPInstruction::FirstOrderRecurrenceSplice>())))
@@ -7090,10 +7050,19 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
                 RepR->getUnderlyingInstr(), VF))
           return true;
       }
+
+      // The VPlan-based cost model knows that if TC <= VF then no compare is
+      // needed in branch-on-count, but the legacy cost model does not.
+      if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
+        Value *TC = Plan.getTripCount()->getUnderlyingValue();
+        ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(TC);
+        if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
+          return true;
+      }
+
       if (Instruction *UI = GetInstructionForCost(&R)) {
         // If we adjusted the predicate of the recipe, the cost in the legacy
         // cost model may be different.
-        using namespace VPlanPatternMatch;
         CmpPredicate Pred;
         if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
             cast<VPRecipeWithIRFlags>(R).getPredicate() !=
@@ -7104,6 +7073,37 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     }
   }
 
+  // The VPlan may have been transformed such that the original loop exit
+  // condition, and the instructions that are used only by it, no longer
+  // exists, but we expect the transformed version to have the same cost.
+  // Therefore mark all such instructions as seen.
+  SmallVector<BasicBlock *> Exiting;
+  CostCtx.CM.TheLoop->getExitingBlocks(Exiting);
+  SetVector<Instruction *> ExitInstrs;
+  // Collect all exit conditions.
+  for (BasicBlock *EB : Exiting) {
+    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
+    if (!Term)
+      continue;
+    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0)))
+      ExitInstrs.insert(CondI);
+  }
+  // Collect all instructions only feeding the exit conditions.
+  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
+    Instruction *CondI = ExitInstrs[I];
+    SeenInstrs.insert(CondI);
+    for (Value *Op : CondI->operands()) {
+      auto *OpI = dyn_cast<Instruction>(Op);
+      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
+          any_of(OpI->users(), [&ExitInstrs, TheLoop](User *U) {
+            return TheLoop->contains(cast<Instruction>(U)->getParent()) &&
+                   !ExitInstrs.contains(cast<Instruction>(U));
+          }))
+        continue;
+      ExitInstrs.insert(OpI);
+    }
+  }
+
   // Return true if the loop contains any instructions that are not also part of
   // the VPlan or are skipped for VPlan-based cost computations. This indicates
   // that the VPlan contains extra simplifications.
@@ -7119,6 +7119,39 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     });
   });
 }
+
+static bool planContainsDifferentCompares(VPlan &Plan, VPCostContext &CostCtx,
+                                          Loop *TheLoop, ElementCount VF) {
+  // Count how many compare instructions there are in the legacy cost model.
+  unsigned NumLegacyCompares = 0;
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    for (auto &I : *BB) {
+      if (isa<CmpInst>(I)) {
+        NumLegacyCompares += 1;
+      }
+    }
+  }
+
+  // Count how many compare instructions there are in the VPlan.
+  unsigned NumVPlanCompares = 0;
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto Iter = vp_depth_first_deep(VectorRegion->getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    // Only the blocks in the vector region are relevant.
+    if (VPBB->getEnclosingLoopRegion() != VectorRegion)
+      continue;
+    for (VPRecipeBase &R : *VPBB) {
+      using namespace VPlanPatternMatch;
+      if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+          match(&R, m_Cmp(m_VPValue(), m_VPValue())))
+        NumVPlanCompares += 1;
+    }
+  }
+
+  // If we have a different amount, then the legacy cost model and vplan will
+  // disagree.
+  return NumLegacyCompares != NumVPlanCompares;
+}
 #endif
 
 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
@@ -7226,6 +7259,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // * VPlans with additional VPlan simplifications,
   // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
   //   vp_scatter/vp_gather).
+  // * VPlans containing a different number of compare instructions to what's
+  //   present in the original scalar loop.
   // The legacy cost model doesn't properly model costs for such loops.
   bool UsesEVLGatherScatter =
       any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
@@ -7242,7 +7277,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
           planContainsAdditionalSimplifications(
               getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
           planContainsAdditionalSimplifications(
-              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
+              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width) ||
+          planContainsDifferentCompares(BestPlan, CostCtx, OrigLoop,
+                                        BestFactor.Width)) &&
          " VPlan cost model and legacy cost model disagreed");
   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
          "when vectorizing, the scalar cost must be computed.");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fb7aaec4e93a9..740c61a665e8b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1109,6 +1109,30 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
     return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
                                                     VecTy, Ctx.CostKind, 0);
   }
+  case VPInstruction::BranchOnCount: {
+    // If TC <= VF then this is just a branch.
+    // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF
+    // where it checks TC <= VF * UF, but we don't know UF yet. This means in
+    // some cases we get a cost that's too high due to counting a cmp that
+    // later gets removed.
+    // FIXME: The compare could also be removed if TC = M * vscale,
+    // VF = N * vscale, and M <= N. Detecting that would require having the
+    // trip count as a SCEV though.
+    Value *TC = getParent()->getPlan()->getTripCount()->getUnderlyingValue();
+    ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(TC);
+    if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
+      return 0;
+    // Otherwise BranchOnCount generates ICmpEQ followed by a branch.
+    Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
+    return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy,
+                                      CmpInst::makeCmpResultType(ValTy),
+                                      CmpInst::ICMP_EQ, Ctx.CostKind);
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+    return getCostForRecipeWithOpcode(
+        getOpcode(),
+        vputils::onlyFirstLaneUsed(this) ?
ElementCount::getFixed(1) : VF, Ctx); case VPInstruction::ExtractPenultimateElement: if (VF == ElementCount::getScalable(1)) return InstructionCost::getInvalid(); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 4f2933ad2f85c..d8abb028787f5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -290,10 +290,10 @@ define void @latch_branch_cost(ptr %dst) { ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ] -; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 99) -; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] +; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] +; PRED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 99) +; PRED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 ; PRED-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; PRED: [[PRED_STORE_IF]]: ; PRED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 @@ -301,7 +301,7 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: store i8 0, ptr [[TMP3]], align 1 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] ; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; PRED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 ; PRED-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] ; PRED: [[PRED_STORE_IF1]]: ; PRED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 @@ -309,7 +309,7 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: store i8 0, ptr [[TMP6]], align 1 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; PRED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 ; PRED-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] ; PRED: [[PRED_STORE_IF3]]: ; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 @@ -317,50 +317,18 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: store i8 0, ptr [[TMP9]], align 1 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; PRED: [[PRED_STORE_CONTINUE4]]: -; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 -; PRED-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; PRED-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] ; PRED: [[PRED_STORE_IF5]]: ; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3 -; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] -; PRED-NEXT: store i8 0, ptr [[TMP12]], align 1 +; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; PRED: [[PRED_STORE_CONTINUE6]]: -; PRED-NEXT: 
[[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 -; PRED-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] -; PRED: [[PRED_STORE_IF7]]: -; PRED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] -; PRED-NEXT: store i8 0, ptr [[TMP15]], align 1 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE8]] -; PRED: [[PRED_STORE_CONTINUE8]]: -; PRED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 -; PRED-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] -; PRED: [[PRED_STORE_IF9]]: -; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5 -; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]] -; PRED-NEXT: store i8 0, ptr [[TMP18]], align 1 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE10]] -; PRED: [[PRED_STORE_CONTINUE10]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 -; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] -; PRED: [[PRED_STORE_IF11]]: -; PRED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]] -; PRED-NEXT: store i8 0, ptr [[TMP21]], align 1 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE12]] -; PRED: [[PRED_STORE_CONTINUE12]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 -; PRED-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]] -; PRED: [[PRED_STORE_IF13]]: -; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE14]] -; PRED: [[PRED_STORE_CONTINUE14]]: -; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8) -; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104 -; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) +; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[EXIT]]: @@ -522,23 +490,23 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-LABEL: define void @multiple_exit_conditions( ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 257, [[TMP3]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP0]], 4 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP4]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: -; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP4]], 4 -; DEFAULT-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP11]], 4 -; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem 
i64 257, [[TMP5]] -; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]] -; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[N_VEC]], 8 +; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP2]], 4 +; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP11]], 4 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]] +; DEFAULT-NEXT: [[INDEX:%.*]] = sub i64 257, [[N_MOD_VF]] +; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] -; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2 +; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 2 ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; DEFAULT-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX1]], 8 ; DEFAULT-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2 ; DEFAULT-NEXT: [[TMP8:%.*]] = or i16 [[TMP1]], 1 @@ -554,11 +522,11 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP12]], align 8 ; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP15]], align 8 ; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP18]], align 8 -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; DEFAULT-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP3]] +; DEFAULT-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[INDEX]] ; DEFAULT-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]] +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[INDEX]] ; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: ; @@ -1264,7 +1232,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; PRED: [[VECTOR_MEMCHECK]]: ; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16 +; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 ; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[A2]] ; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] ; PRED-NEXT: [[TMP4:%.*]] = sub i64 [[C1]], [[B3]] @@ -1273,42 +1241,42 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16 +; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 ; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 4 +; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2 ; PRED-NEXT: [[TMP9:%.*]] = sub i64 [[TMP0]], [[TMP8]] ; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], [[TMP8]] ; PRED-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[Y]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[Y]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP13:%.*]] = uitofp [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP12]], [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP13:%.*]] = uitofp [[WIDE_MASKED_LOAD]] to ; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP15:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer -; PRED-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer -; PRED-NEXT: [[TMP17:%.*]] = xor [[WIDE_MASKED_LOAD]], splat (i8 1) -; PRED-NEXT: [[TMP18:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i8 1) -; PRED-NEXT: [[TMP19:%.*]] = udiv [[TMP17]], [[TMP18]] -; PRED-NEXT: [[TMP20:%.*]] = icmp ugt [[TMP19]], splat (i8 1) -; PRED-NEXT: [[TMP21:%.*]] = select [[TMP20]], zeroinitializer, splat (i32 255) -; PRED-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP21]], zeroinitializer -; PRED-NEXT: [[TMP22:%.*]] = zext [[WIDE_MASKED_LOAD]] to -; PRED-NEXT: [[TMP23:%.*]] = sub [[PREDPHI]], [[TMP22]] -; PRED-NEXT: [[TMP24:%.*]] = sitofp [[TMP23]] to -; PRED-NEXT: [[TMP25:%.*]] = call @llvm.fmuladd.nxv16f32( [[TMP24]], splat (float 3.000000e+00), [[TMP13]]) -; PRED-NEXT: [[TMP26:%.*]] = fptoui [[TMP25]] to +; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP15:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer +; PRED-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; PRED-NEXT: [[TMP17:%.*]] = xor [[WIDE_MASKED_LOAD]], splat (i8 1) +; PRED-NEXT: [[TMP18:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i8 1) +; PRED-NEXT: [[TMP19:%.*]] = udiv [[TMP17]], [[TMP18]] +; PRED-NEXT: [[TMP20:%.*]] = icmp ugt [[TMP19]], splat (i8 1) +; PRED-NEXT: [[TMP21:%.*]] = select [[TMP20]], zeroinitializer, splat (i32 255) +; PRED-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP21]], zeroinitializer +; PRED-NEXT: [[TMP22:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[TMP23:%.*]] = sub [[PREDPHI]], [[TMP22]] +; PRED-NEXT: [[TMP24:%.*]] = sitofp [[TMP23]] to +; PRED-NEXT: [[TMP25:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP24]], splat (float 3.000000e+00), [[TMP13]]) +; PRED-NEXT: [[TMP26:%.*]] = fptoui [[TMP25]] to ; PRED-NEXT: 
[[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP26]], ptr align 1 [[TMP27]], [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP26]], ptr align 1 [[TMP27]], [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) -; PRED-NEXT: [[TMP28:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; PRED-NEXT: [[TMP28:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP29:%.*]] = xor i1 [[TMP28]], true ; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index f645db16ed3c6..7ebc7a6d4849b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -9,11 +9,12 @@ define i64 @test(ptr %a, ptr %b) #0 { ; CHECK-LABEL: LV: Checking a loop in 'test' ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 56 ; CHECK: LV: Selecting VF: 16 entry: @@ -43,12 +44,13 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK-LABEL: LV: Checking a loop in 'test_external_iv_user' ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 57 ; CHECK: LV: Selecting VF: 16 entry: @@ -80,8 +82,8 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: induction instruction 
%j.iv.next = add nuw nsw i64 %j.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 16 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] @@ -117,11 +119,14 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef ; CHECK-LABEL: LV: Checking a loop in 'test_extra_cmp_user' ; CHECK: Cost of 4 for VF 8: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 12 +; CHECK: Cost of 4 for VF 8: WIDEN ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<16> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost for VF 8: 13 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: WIDEN ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<16> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 4 ; CHECK: LV: Selecting VF: 16 entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index 4cacc30a714b6..2992b01285160 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -124,38 +124,38 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; PRED: [[VECTOR_MEMCHECK]]: ; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16 +; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8 ; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]] ; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] ; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16 -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4 +; PRED-NEXT: [[TMP7:%.*]] = shl nuw i64 
[[TMP6]], 3 ; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP0]], [[TMP7]] ; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP0]], [[TMP7]] ; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT]] to +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP13:%.*]] = zext [[WIDE_MASKED_LOAD]] to -; PRED-NEXT: [[TMP14:%.*]] = mul [[TMP13]], [[TMP11]] -; PRED-NEXT: [[TMP15:%.*]] = or [[TMP14]], [[TMP13]] -; PRED-NEXT: [[TMP16:%.*]] = lshr [[TMP15]], splat (i16 1) -; PRED-NEXT: [[TMP17:%.*]] = trunc [[TMP16]] to +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP12]], [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP13:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[TMP14:%.*]] = mul [[TMP13]], [[TMP11]] +; PRED-NEXT: [[TMP15:%.*]] = or [[TMP14]], [[TMP13]] +; PRED-NEXT: [[TMP16:%.*]] = lshr [[TMP15]], splat (i16 1) +; PRED-NEXT: [[TMP17:%.*]] = trunc [[TMP16]] to ; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP17]], ptr align 1 [[TMP18]], [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP17]], ptr align 1 [[TMP18]], [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP10]]) -; PRED-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]]) +; PRED-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true ; PRED-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 40db6a53b49e4..ad371c5ce889f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -94,7 +94,7 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 10 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll index dd6f0fe5f1292..d8e313f28f7a5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll @@ -16,7 +16,7 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1 -; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP3]], i32 6) +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP3]], i32 8) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[UMAX]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll index c1b175f39e852..d4b1ade32b19b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll @@ -23,16 +23,16 @@ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize --disable-output < %s 2>&1 \ ; RUN: | FileCheck %s --check-prefixes=NEOVERSEV2 -; VSCALEFORTUNING1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5. -; VSCALEFORTUNING1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2. +; VSCALEFORTUNING1: Cost for VF vscale x 2: 10 (Estimated cost per lane: 5. +; VSCALEFORTUNING1: Cost for VF vscale x 4: 10 (Estimated cost per lane: 2. ; VSCALEFORTUNING1: LV: Selecting VF: vscale x 16 -; VSCALEFORTUNING2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2. -; VSCALEFORTUNING2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1. +; VSCALEFORTUNING2: Cost for VF vscale x 2: 10 (Estimated cost per lane: 2. +; VSCALEFORTUNING2: Cost for VF vscale x 4: 10 (Estimated cost per lane: 1. ; VSCALEFORTUNING2: LV: Selecting VF: vscale x 16 -; NEOVERSEV2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5. -; NEOVERSEV2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2. +; NEOVERSEV2: Cost for VF vscale x 2: 10 (Estimated cost per lane: 5. +; NEOVERSEV2: Cost for VF vscale x 4: 10 (Estimated cost per lane: 2. 
; NEOVERSEV2: LV: Selecting VF: 16 ; VSCALEFORTUNING1: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 66211bd0353d4..63348ccf94f78 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -386,7 +386,7 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[END]] to i10 ; CHECK-NEXT: [[TMP1:%.*]] = zext i10 [[TMP0]] to i64 ; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 12 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll index baf050c7facee..0e2ada9e0c74e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll @@ -45,7 +45,7 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT-NEXT: entry: ; NORMAL_DEP_LIMIT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NORMAL_DEP_LIMIT-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; NORMAL_DEP_LIMIT-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; NORMAL_DEP_LIMIT-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 12) ; NORMAL_DEP_LIMIT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]] ; NORMAL_DEP_LIMIT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; NORMAL_DEP_LIMIT: vector.memcheck: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll index 64909a7c2fcec..13c21218381df 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll @@ -21,7 +21,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: LV: Scalar loop costs: 5. ; CHECK: Cost of 1 for VF 2: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 2: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -34,11 +33,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 16 for VF 2: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 2: 86 (Estimated cost per lane: 43. 
; CHECK: Cost of 1 for VF 4: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 4: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -51,11 +49,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 2 for VF 4: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 4: 10 (Estimated cost per lane: 2. ; CHECK: Cost of 1 for VF 8: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 8: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -68,7 +65,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 2 for VF 8: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 46 (Estimated cost per lane: 5. ; CHECK: LV: Selecting VF: 4. 
define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 { @@ -134,7 +131,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 2: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 2: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 2: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -156,7 +152,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 2: 130 (Estimated cost per lane: 65. ; CHECK: Cost of 1 for VF 4: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 4: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -166,7 +162,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 4: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 4: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 4: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 4: EMIT vp<[[CAN_IV:%.]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -188,7 +183,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 4: 14 (Estimated cost per lane: 3. 
; CHECK: Cost of 1 for VF 8: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 8: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -198,7 +193,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 8: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 8: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 8: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 8: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 8: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -220,7 +214,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}} -; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 26 (Estimated cost per lane: 3. ; CHECK: Cost of 1 for VF 16: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 16: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -230,7 +224,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 16: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 16: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 16: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 16: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 16: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -252,7 +245,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 50 ; CHECK: LV: Selecting VF: 16. 
define void @cheap_icmp(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) #0 { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll index 9f747dd6222a2..308c29e4a6013 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll @@ -12,7 +12,7 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP8]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 15) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 17) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -73,7 +73,7 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 15) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 17) ; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: @@ -157,7 +157,7 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP8]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 15) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 17) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -218,7 +218,7 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 15) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 17) ; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: @@ -302,7 +302,7 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP8]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 15) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 17) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: 
[[VECTOR_MEMCHECK]]: @@ -363,7 +363,7 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 15) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 17) ; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: @@ -447,7 +447,7 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP8]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 15) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP13]], i64 17) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -508,7 +508,7 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 15) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 17) ; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index 7e6e45feaa834..08bbb9ad661f6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -16,7 +16,8 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; OUTLOOP: for.body.preheader: ; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; OUTLOOP-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2 -; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]] +; OUTLOOP-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 8) +; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[UMAX]] ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; OUTLOOP: vector.ph: ; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() @@ -66,7 +67,8 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; INLOOP: for.body.preheader: ; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; INLOOP-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 3 -; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]] +; INLOOP-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 16) +; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[UMAX]] ; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INLOOP: vector.ph: ; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() @@ -195,7 +197,8 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; OUTLOOP-NEXT: entry: ; 
OUTLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; OUTLOOP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; OUTLOOP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[UMAX]] ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; OUTLOOP: vector.ph: ; OUTLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -241,7 +244,8 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; INLOOP-NEXT: entry: ; INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; INLOOP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; INLOOP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[UMAX]] ; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INLOOP: vector.ph: ; INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -301,7 +305,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP15]]) ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] @@ -326,7 +330,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]] ; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-INLOOP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-INLOOP: middle.block: ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-INLOOP: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll index 328ee16a92db4..27a9ceeb6a0b9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll @@ -9,7 +9,8 @@ define void @load_store(ptr %p) { ; LMUL1-LABEL: @load_store( ; LMUL1-NEXT: entry: ; LMUL1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; LMUL1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 2) +; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; LMUL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; LMUL1: vector.ph: ; LMUL1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 78b54efe7e682..81b0ef6fc2a8d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -125,11 
+125,11 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i8.p0(ptr align 1 [[SRC:%.*]], splat (i1 true), i32 8) -; CHECK-NEXT: [[TMP6:%.*]] = shl [[VP_OP_LOAD]], splat (i8 1) -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i8.p0(ptr align 1 [[DST:%.*]], splat (i1 true), i32 8) -; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], [[VP_OP_LOAD1]] -; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0( [[TMP7]], ptr align 1 [[DST]], splat (i1 true), i32 8) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[SRC:%.*]], splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP0:%.*]] = shl [[VP_OP_LOAD]], splat (i8 1) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[DST:%.*]], splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = add [[TMP0]], [[VP_OP_LOAD1]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0( [[TMP1]], ptr align 1 [[DST]], splat (i1 true), i32 8) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index 1ae1ba6795c01..117be46ec737d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -15,7 +15,8 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; V-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; V-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; V: vector.ph: ; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -48,7 +49,8 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: entry: ; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVQDOTQ-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; ZVQDOTQ-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; ZVQDOTQ: vector.ph: ; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -208,7 +210,8 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; V-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; V-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; V: vector.ph: ; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -241,7 +244,8 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: entry: ; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVQDOTQ-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; ZVQDOTQ-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; 
ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; ZVQDOTQ: vector.ph: ; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -401,7 +405,8 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; V-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; V-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; V: vector.ph: ; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -434,7 +439,8 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: entry: ; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVQDOTQ-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; ZVQDOTQ-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; ZVQDOTQ: vector.ph: ; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -593,7 +599,8 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; V-NEXT: entry: ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; V-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; V-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; V: vector.ph: ; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -626,7 +633,8 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: entry: ; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVQDOTQ-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; ZVQDOTQ-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; ZVQDOTQ: vector.ph: ; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index d156d0c501e1f..b52e3346acd28 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -664,7 +664,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-UF2-NEXT: [[P21:%.*]] = ptrtoint ptr [[P2:%.*]] to i64 ; NOSTRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 12) +; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 14) ; NOSTRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NOSTRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; NOSTRIDED-UF2: vector.scevcheck: @@ -816,7 +816,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 
%stride) { ; STRIDED-UF2-NEXT: entry: ; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 79) +; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 80) ; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX9]] ; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; STRIDED-UF2: vector.scevcheck: @@ -1172,7 +1172,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: entry: ; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 28) +; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 30) ; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX6]] ; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; STRIDED-UF2: vector.memcheck: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll index f5ff512b94123..6ae7459dca0c0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll @@ -1106,7 +1106,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) { ; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64 ; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP9]], 2 -; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 16) ; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; NO-VP: [[VECTOR_MEMCHECK]]: @@ -1194,7 +1194,7 @@ define void @log10(ptr %a, ptr %b, i64 %N) { ; NO-VP-NEXT: [[ENTRY:.*]]: ; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64 ; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 12 ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; NO-VP: [[VECTOR_MEMCHECK]]: ; NO-VP-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B2]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll index 317bcde2f4670..a02d9b7d9a247 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll @@ -1214,7 +1214,8 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; NO-VP-NEXT: [[ENTRY:.*]]: ; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP13]], 1 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 4) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NO-VP: [[VECTOR_PH]]: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll index 8cd540c3888db..9d45c894c6c88 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll @@ -41,7 +41,8 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: ; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP12]], i64 4) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NO-VP: [[VECTOR_PH]]: ; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() @@ -137,7 +138,8 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: ; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP12]], i64 4) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NO-VP: [[VECTOR_PH]]: ; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() @@ -232,7 +234,8 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: ; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP12]], i64 4) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NO-VP: [[VECTOR_PH]]: ; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() @@ -327,7 +330,8 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]: ; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP12]], i64 4) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NO-VP: [[VECTOR_PH]]: ; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index b5662b0bd8d3b..bd920fe18aacb 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -277,7 +277,8 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 8) +; NO-VP-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() @@ -518,7 +519,8 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 8) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() @@ -635,7 +637,8 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[SMIN]] ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll index 1aea6aaff66a3..e9b6d684ec54a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll @@ -34,7 +34,8 @@ define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) { ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; NO-VP-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP0]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP10]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP10]], i32 8) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll index e9dcf8f0f0395..3ae9e51f71ec2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll @@ -38,7 +38,8 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP12]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP12]], i64 8) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll index 
13990000585ea..6c8449c8ec230 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll @@ -51,7 +51,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -174,7 +175,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[ENTRY:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll index e1c62fe2d043d..77992691ebff7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll @@ -39,7 +39,8 @@ define void @test(ptr %p) { ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 4) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -111,7 +112,7 @@ define void @test_may_clobber1(ptr %p) { ; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: exit: @@ -221,7 +222,7 @@ define void @test_may_clobber3(ptr %p) { ; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: exit: @@ -285,7 +286,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 
[[TMP13]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP5]], [[TMP13]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: exit: @@ -295,7 +296,8 @@ define void @trivial_due_max_vscale(ptr %p) { ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 4) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -372,7 +374,7 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: exit: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll index d4d7d398185a1..71eae95506455 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll @@ -38,7 +38,8 @@ define void @load_store_interleave_group(ptr noalias %data) { ; EPILOGUE-NEXT: [[ENTRY:.*]]: ; EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; EPILOGUE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] +; EPILOGUE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 4) +; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[UMAX]] ; EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; EPILOGUE: [[VECTOR_PH]]: ; EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -137,7 +138,8 @@ define void @load_store_interleave_group_i32(ptr noalias %data) { ; EPILOGUE-NEXT: [[ENTRY:.*]]: ; EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; EPILOGUE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] +; EPILOGUE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[UMAX]] ; EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; EPILOGUE: [[VECTOR_PH]]: ; EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index f4c7c6f6fba1b..837dddb132917 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -250,10 +250,74 @@ exit: define void @test_minbws_for_trunc(i32 %n, ptr noalias %p1, ptr noalias %p2) { ; CHECK-LABEL: define void @test_minbws_for_trunc( ; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[MUL:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 4, i16 255) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i16, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i16, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i16 4, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i16 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 255) +; CHECK-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P2]], i64 [[MUL_RESULT2]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[P2]] +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P2]], i64 2042 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[P2]], i64 1021 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[P2]], i64 8168 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[P2]], [[SCEVGEP4]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[P2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[P2]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[P2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult ptr [[P2]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult ptr [[P2]], [[SCEVGEP4]] +; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; CHECK-NEXT: [[CONFLICT_RDX12:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX12]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i16() +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i16 4) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 256, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[TMP9]] to i16 +; CHECK-NEXT: [[TMP11:%.*]] = mul i16 4, [[TMP10]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector 
[[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[P1]], [[TMP12]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP13]], splat (i1 true), i32 [[TMP9]]) +; CHECK-NEXT: [[TMP14:%.*]] = trunc [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [1 x [1 x i16]], ptr [[P2]], [[TMP12]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i16.nxv2p0( [[TMP14]], align 2 [[TMP15]], splat (i1 true), i32 [[TMP9]]), !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[P2]], [[TMP12]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( [[TMP16]], align 1 [[TMP17]], splat (i1 true), i32 [[TMP9]]), !alias.scope [[META12:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[P2]], [[TMP12]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( zeroinitializer, align 8 [[TMP18]], splat (i1 true), i32 [[TMP9]]), !alias.scope [[META13]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_EXT:%.*]] = sext i16 [[IV]] to i64 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i64 [[IV_EXT]] ; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[GEP1]], align 4 @@ -268,7 +332,7 @@ define void @test_minbws_for_trunc(i32 %n, ptr noalias %p1, ptr noalias %p2) { ; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 4 ; CHECK-NEXT: [[IV_NEXT_EXT:%.*]] = sext i16 [[IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[IV_NEXT_EXT]], 1024 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -307,4 +371,14 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK: [[META9]] = !{[[META10:![0-9]+]], [[META11:![0-9]+]]} +; CHECK: [[META10]] = distinct !{[[META10]], [[META8]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META8]]} +; CHECK: [[META12]] = !{[[META10]]} +; CHECK: [[META13]] = !{[[META11]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll index 0a64723b6ff9d..ff9168fe73755 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll @@ -38,7 +38,8 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: entry: ; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[UMAX]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: ; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll index 78c71fd3beb89..07ce38a40333b 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll @@ -16,25 +16,25 @@ define void @test_scalar_steps_target_instruction_cost(ptr %dst) { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IV]], splat (i64 8) +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IV]], splat (i64 12) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 ; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP2]] -; CHECK-NEXT: store i64 [[TMP2]], ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP5]] +; CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP6]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 ; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_IF1]]: -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP5]] -; CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP8]] +; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP9]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 14 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] @@ 
-49,7 +49,7 @@ loop: %gep = getelementptr inbounds i64, ptr %dst, i64 %iv store i64 %iv, ptr %gep, align 8 %iv.next = add nuw nsw i64 %iv, 3 - %cmp = icmp ult i64 %iv, 22 + %cmp = icmp ult i64 %iv, 34 br i1 %cmp, label %loop, label %exit exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index 3c39a1d4e2463..7d6455bae7038 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -9,10 +9,9 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK-LABEL: 'wide_or_replaced_with_add_vpinstruction' ; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0> -; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> +; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> ; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer inbounds ir<%g.src> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<%5> @@ -23,14 +22,21 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%g.dst> ; CHECK: Cost of 1 for VF 2: WIDEN store vp<%6>, ir<%iv.4>, ir<%c> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2> ; CHECK: Cost of 0 for VF 2: vector loop backedge +; CHECK: Cost of 0 for VF 2: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 2: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 2: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv +; CHECK: Cost of 0 for VF 2: IR %l = load i64, ptr %g.src, align 8 +; CHECK: Cost of 0 for VF 2: IR %iv.4 = add nuw nsw i64 %iv, 4 +; CHECK: Cost of 0 for VF 2: IR %c = icmp ule i64 %l, 128 +; CHECK: Cost of 1 for VF 2: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<%2> +; CHECK: Cost of 0 for VF 2: EMIT branch-on-cond vp<%cmp.n> ; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; CHECK: Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0> -; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> +; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> ; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer inbounds ir<%g.src> ; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<%5> @@ -41,11 
+47,18 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer ir<%g.dst> ; CHECK: Cost of 1 for VF 4: WIDEN store vp<%6>, ir<%iv.4>, ir<%c> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2> ; CHECK: Cost of 0 for VF 4: vector loop backedge +; CHECK: Cost of 0 for VF 4: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK: Cost of 0 for VF 4: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK: Cost of 0 for VF 4: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv +; CHECK: Cost of 0 for VF 4: IR %l = load i64, ptr %g.src, align 8 +; CHECK: Cost of 0 for VF 4: IR %iv.4 = add nuw nsw i64 %iv, 4 +; CHECK: Cost of 0 for VF 4: IR %c = icmp ule i64 %l, 128 +; CHECK: Cost of 1 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<%2> +; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n> ; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index 69505eb176f50..17dc704ffe327 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -174,16 +174,34 @@ attributes #0 = { "target-cpu"="knl" } define void @PR40816() #1 { ; CHECK-LABEL: define void @PR40816( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: store i32 [[TMP0]], ptr @b, align 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 2 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[TMP0]], 1 -; CHECK-NEXT: br i1 [[CMP2]], label %[[RETURN:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: store i32 0, ptr @b, align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; CHECK: [[PRED_STORE_IF1]]: +; CHECK-NEXT: store i32 1, ptr @b, align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; CHECK: [[PRED_STORE_CONTINUE2]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[RETURN:.*]] +; CHECK: [[PRED_STORE_IF3]]: +; CHECK-NEXT: store i32 2, ptr @b, align 1 +; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[RETURN]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; CHECK: [[PRED_STORE_IF5]]: +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; CHECK: [[PRED_STORE_CONTINUE6]]: +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK: 
[[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[RETURN1:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; ; FORCE-LABEL: define void @PR40816( ; FORCE-SAME: ) #[[ATTR1:[0-9]+]] { diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll index f65a9d7d45ed8..ebde58dc1ffef 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -199,49 +199,23 @@ exit: define void @redundant_and_1(ptr %dst, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: @redundant_and_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> , <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> , <4 x i1> [[TMP2]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 0 -; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 1 -; CHECK-NEXT: store i32 0, ptr [[TMP12]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 2 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[PRED_STORE_IF3:%.*]] ] +; CHECK-NEXT: br i1 [[TMP10:%.*]], label [[PRED_STORE_IF3]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK: then.1: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], false +; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1:%.*]], i1 false +; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[PRED_STORE_IF3]] +; CHECK: then.2: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[IV]] ; CHECK-NEXT: store i32 0, ptr [[TMP15]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3 -; CHECK-NEXT: br i1 [[TMP16]], label 
[[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 3 -; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] +; CHECK-NEXT: br label [[PRED_STORE_IF3]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[PRED_STORE_CONTINUE]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 801f910c5e13d..6a339db51d2a6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -827,54 +827,54 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N]], 4294967295 ; CHECK-NEXT: br i1 [[TMP2]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 32 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) -; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[STEP_ADD_2]] to <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[STEP_ADD_3]] to <4 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP4]] -; 
CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2) -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2) -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2) -; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2) -; CHECK-NEXT: [[TMP15]] = or <4 x i32> [[TMP11]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP16]] = or <4 x i32> [[TMP12]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP17]] = or <4 x i32> [[TMP13]], [[VEC_PHI3]] -; CHECK-NEXT: [[TMP18]] = or <4 x i32> [[TMP14]], [[VEC_PHI4]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <8 x i32> [[STEP_ADD]], splat (i32 8) +; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <8 x i32> [[STEP_ADD_2]], splat (i32 8) +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[VEC_IND]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[STEP_ADD]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[STEP_ADD_2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i32> [[STEP_ADD_3]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2) +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2) +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2) +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2) +; CHECK-NEXT: [[TMP15]] = or <8 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP16]] = or <8 x i32> [[TMP12]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP17]] = or <8 x i32> [[TMP13]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP18]] = or <8 x i32> [[TMP14]], [[VEC_PHI4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[STEP_ADD_3]], splat (i32 8) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], 
[[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <4 x i32> [[TMP18]], [[BIN_RDX5]] -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX6]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <8 x i32> [[TMP17]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <8 x i32> [[TMP18]], [[BIN_RDX5]] +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[BIN_RDX6]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF26:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -898,7 +898,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i32 [[INDEX9]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND10]], splat (i32 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT15]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC8]] @@ -915,7 +915,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[SELECT_I:%.*]] = select i1 [[EXITCOND]], i32 0, i32 2 ; CHECK-NEXT: [[SELECT_NEXT]] = or i32 [[SELECT_I]], [[SELECT]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SELECT_NEXT_LCSSA:%.*]] = phi i32 [ [[SELECT_NEXT]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SELECT_NEXT_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 04bff3c393f62..9d80f54b6c49a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -395,37 +395,18 @@ exit: define i16 @iv_and_step_trunc() { ; CHECK-LABEL: define i16 @iv_and_step_trunc() { ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <16 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <16 x i64> [[VEC_IND]], 
splat (i64 1)
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <16 x i64> [[TMP0]] to <16 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i16> [[VEC_IND1]], [[TMP1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
-; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <16 x i16> [[VEC_IND1]], splat (i16 16)
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i16> [[TMP2]], i32 15
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP1:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 96, [[LOOP]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ]
-; CHECK-NEXT: [[REC:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[LOOP]] ], [ [[REC_NEXT:%.*]], [[LOOP1]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[REC:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[REC_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[IV_NEXT]] to i16
-; CHECK-NEXT: [[REC_NEXT]] = mul i16 [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP1]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[IV]] to i16
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV_NEXT]] to i16
+; CHECK-NEXT: [[REC_NEXT]] = mul i16 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = phi i16 [ [[REC]], [[LOOP1]] ]
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = phi i16 [ [[REC]], [[LOOP]] ]
 ; CHECK-NEXT: ret i16 [[VECTOR_RECUR_EXTRACT_FOR_PHI]]
 ;
 entry:
@@ -438,7 +419,7 @@ loop:
   %0 = trunc i64 %iv to i16
   %1 = trunc i64 %iv.next to i16
   %rec.next = mul i16 %0, %1
-  %ec = icmp eq i64 %iv, 100
+  %ec = icmp eq i64 %iv, 1
   br i1 %ec, label %exit, label %loop
 
 exit:
@@ -487,7 +468,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], splat (i64 8)
 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK: vec.epilog.iter.check:
@@ -513,7 +494,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 {
 ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4)
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100
-; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]]
 ; CHECK: vec.epilog.scalar.ph:
@@ -532,7 +513,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 {
 ; CHECK: loop.latch:
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret i32 0
 ;
@@ -567,55 +548,14 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) {
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[N]], 8
 ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK: loop.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; CHECK: pred.store.if1:
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
-; CHECK: pred.store.continue2:
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; CHECK: pred.store.if3:
-; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.if5:
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP1]], 3
-; CHECK-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[INDEX]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT:%.*]], label [[VECTOR_BODY]]
 ; CHECK: exit.loopexit:
 ; CHECK-NEXT: br label [[EXIT]]
 ; CHECK: exit:
@@ -699,7 +639,7 @@ define void @wombat(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK: scalar.ph:
@@ -714,7 +654,7 @@ define void @wombat(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
 ; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -769,7 +709,7 @@ define void @wombat2(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK: scalar.ph:
@@ -785,7 +725,7 @@ define void @wombat2(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
 ; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[MUL3]] to i60
 ; CHECK-NEXT: [[TRUNC_1]] = trunc i60 [[TRUNC_0]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -842,7 +782,7 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK: scalar.ph:
@@ -858,7 +798,7 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 {
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
 ; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32
 ; CHECK-NEXT: [[DEAD_AND:%.*]] = and i32 [[TRUNC]], 123
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -910,15 +850,12 @@ attributes #1 = { "target-cpu"="skylake-avx512" "target-features"="-avx512f" }
 ; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]]}
 ; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
-; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
-; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
+; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]], [[META2]]}
-; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
-; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
-; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
-; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
-; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
-; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
+; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META2]], [[META1]]}
+; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]], [[META2]]}
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index 2809a77b36f1a..15cd87665fd86 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -27,23 +27,30 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP10]], 1
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP26]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP12]], i64 -7
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP28]], align 1
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD]], <8 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[TMP29]] = xor <8 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 -3
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP26]], i64 -4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i64 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32>
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD4]], <4 x i8> poison, <4 x i32>
+; CHECK-NEXT: [[TMP16]] = xor <4 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP17]] = xor <4 x i8> [[REVERSE5]], [[VEC_PHI3]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], 8
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP31:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP29]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i8> [[TMP17]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> [[BIN_RDX]])
 ; CHECK-NEXT: br label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP31]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index a8c5bb0ee6f3b..dbfd193580512 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -115,18 +115,18 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF-NEXT: br label %[[VECTOR_BODY:.*]]
 ; AUTOVF: [[VECTOR_BODY]]:
 ; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i64 0
-; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
-; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]],
-; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], splat (i32 202)
+; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
+; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]],
+; AUTOVF-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], splat (i32 202)
 ; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]]
-; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 [[TMP2]], <32 x i1> [[TMP1]], <32 x i8> poison)
-; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 2), <32 x i8> splat (i8 1)
-; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr align 1 [[TMP2]], <32 x i1> [[TMP1]])
-; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
-; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
-; AUTOVF-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[TMP0]], <16 x i8> poison)
+; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <16 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AUTOVF-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> splat (i8 2), <16 x i8> splat (i8 1)
+; AUTOVF-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP3]], ptr align 1 [[TMP2]], <16 x i1> [[TMP0]])
+; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 208
+; AUTOVF-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; AUTOVF: [[MIDDLE_BLOCK]]:
 ; AUTOVF-NEXT: br label %[[FOR_END:.*]]
 ; AUTOVF: [[FOR_END]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index 9217c905945ac..78144eacc0411 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -38,7 +38,7 @@ define i32 @main(ptr %ptr) {
 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
 ; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP2]])
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], [[UMIN1]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 24
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 28
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[CONV3]], -1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
index c756a54ec6d2b..5ab80976c8c3e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -14,32 +14,32 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr noundef align 8 dereferenceable_or_null(16) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: bb5:
-; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_BODY]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_BODY]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]],
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 8)
-; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[ARR]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i64 -3
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32>
-; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 8 [[TMP8]], <4 x i1> [[REVERSE]])
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 14)
+; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[ARR]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 -3
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <4 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 8 [[TMP7]], <4 x i1> [[REVERSE]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4)
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_LATCH]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
+; CHECK-NEXT: br label [[BB6:%.*]]
 ; CHECK: bb6:
 ; CHECK-NEXT: ret void
 ;
@@ -60,7 +60,7 @@ bb18: ; preds = %loop.header
 
 loop.latch: ; preds = %bb18, %loop.header
   %iv.next = add nsw i64 %iv, -1
-  %icmp22 = icmp eq i64 %iv.next, 90
+  %icmp22 = icmp eq i64 %iv.next, 84
   br i1 %icmp22, label %bb6, label %loop.header, !prof !22
 
 bb6:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
index 5a396f88b1a64..e73985f86dc46 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -12,7 +12,7 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
 ; COST-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
 ; COST-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
 ; COST-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; COST-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; COST-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
 ; COST-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; COST: [[VECTOR_PH]]:
 ; COST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index ba62ebcd29ffb..0aa4d18c4b4e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -30,8 +30,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: br
 ; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 ; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i32 %lftr.wideiv, %n
-; CHECK: Cost of 0 for VF 2: exit condition instruction %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 ; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi vp<{{.+}}>, vp<[[EXT:%.+]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
@@ -49,7 +47,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[TRUNC:%.+]]> = trunc ir<%add5> to i8
 ; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[EXT]]> = zext vp<[[TRUNC]]> to i32
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
-; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ;
 define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
index 7e6b5e932b6c6..570ac2919291d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
@@ -14,21 +14,21 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
 ; CHECK-NEXT: br label %[[LOOP_1:.*]]
 ; CHECK: [[LOOP_1]]:
-; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ZEXT]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[IV_1_LCSSA2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ZEXT]], %[[LOOP_1]] ]
 ; CHECK-NEXT: br i1 false, label %[[LOOP_1_EXIT:.*]], label %[[LOOP_1]]
 ; CHECK: [[LOOP_1_EXIT]]:
-; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1_LCSSA2]], %[[LOOP_1]] ]
 ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_3_PREHEADER:.*]]
 ; CHECK: [[LOOP_3_PREHEADER]]:
 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[IV_1_LCSSA2]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> , [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr align 1 [[DST]], <16 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> zeroinitializer, ptr align 1 [[DST]], <8 x i1> [[TMP0]])
 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT_1_LOOPEXIT1:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
index 199bd5b31c70d..6debead36946c 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -82,7 +82,8 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; NO-VP-DEF-LABEL: @foo(
 ; NO-VP-DEF-NEXT: entry:
 ; NO-VP-DEF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-DEF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP0]]
+; NO-VP-DEF-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 2)
+; NO-VP-DEF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[UMAX]]
 ; NO-VP-DEF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-DEF: vector.ph:
 ; NO-VP-DEF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()