diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index f82fc588639dd..fe5f21ac61274 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -300,9 +300,17 @@ class LoopVectorizationLegality { /// masking. bool canFoldTailByMasking() const; + /// Returns true if all instructions in the loop support masking or + /// speculation. + /// + /// The mask may be loop-invariant if it represents a maximum safe dependence + /// distance (alias mask) or loop-variant if it is based on the induction + /// variable (e.g. tail-folding). + bool canMaskLoop() const; + /// Mark all respective loads/stores for masking. Must only be called when - /// tail-folding is possible. - void prepareToFoldTailByMasking(); + /// masking is possible. + void prepareToMaskLoop(); /// Returns the primary induction variable. PHINode *getPrimaryInduction() { return PrimaryInduction; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index d4083c49626fe..e3cf650ddb76b 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, return (ScalarOpdIdx == 2); case Intrinsic::experimental_vp_splice: return ScalarOpdIdx == 2 || ScalarOpdIdx == 4; + case Intrinsic::loop_dependence_war_mask: + return true; default: return false; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index e57e0cf636501..66638dec9256b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -2131,6 +2131,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { } } + if (!canMaskLoop()) + return false; + + LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); + + return true; +} + +bool LoopVectorizationLegality::canMaskLoop() const { // The list of pointers that we can safely read and write to remains empty. SmallPtrSet<const Value *, 8> SafePointers; @@ -2139,17 +2148,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { SmallPtrSet<const Instruction *, 8> TmpMaskedOp; for (BasicBlock *BB : TheLoop->blocks()) { if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n"); + LLVM_DEBUG(dbgs() << "LV: Cannot mask loop.\n"); return false; } } - LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); - return true; } -void LoopVectorizationLegality::prepareToFoldTailByMasking() { +void LoopVectorizationLegality::prepareToMaskLoop() { // The list of pointers that we can safely read and write to remains empty. SmallPtrSet<const Value *, 8> SafePointers; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 54bb073eb4f81..1019849b1d011 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -668,6 +668,12 @@ class LoopVectorizationPlanner { void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const; + /// Materialize an alias mask for \p DiffChecks in a new check block and + /// return the number of active lanes, i.e. the runtime-clamped VF. + VPValue *materializeAliasMask(VPlan &Plan, + ArrayRef<PointerDiffInfo> DiffChecks, + bool HasBranchWeights); + #ifndef NDEBUG /// \return The most profitable vectorization factor for the available VPlans /// and the cost of that VF. 
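Aside for reviewers (illustration, not part of the patch): a scalar C++ model of the control flow this patch enables, based on the alias_mask test added below. The helper warMaskPopCount and the lane semantics summarized in its comment are assumptions made for the sketch; the authoritative definition of llvm.loop.dependence.war.mask is in the LangRef.

#include <cstddef>

// Hypothetical model of materializing the WAR alias mask and counting its
// active lanes (VPInstruction::NumActiveLanes). Assumes the mask is a
// contiguous prefix of active lanes bounding how many consecutive iterations
// can run together without a write-after-read hazard between Read and Write;
// one lane is always safe, since a single lane preserves scalar order.
static size_t warMaskPopCount(const signed char *Read,
                              const signed char *Write, size_t EltSize,
                              size_t VF) {
  if (Write <= Read)
    return VF; // the write can never clobber a later read
  size_t DistElts = static_cast<size_t>(Write - Read) / EltSize;
  return DistElts == 0 ? 1 : (DistElts < VF ? DistElts : VF);
}

// Scalar rendering of the generated structure: vector.min.vf.check selects
// between the scalar loop and a vector body whose induction variable steps
// by the runtime-clamped VF, followed by a scalar epilogue.
void aliasMaskedLoop(const signed char *a, const signed char *b,
                     signed char *c, size_t n, size_t VF) {
  size_t ClampedVF = warMaskPopCount(b, c, /*EltSize=*/1, VF);
  if (ClampedVF < 2) { // cmp.vf: fewer than two active lanes -> scalar loop
    for (size_t i = 0; i < n; ++i)
      c[i] = b[i] / a[i];
    return;
  }
  size_t NVec = n - n % ClampedVF; // vector trip count
  for (size_t i = 0; i < NVec; i += ClampedVF)
    for (size_t l = 0; l < ClampedVF; ++l) // one masked vector iteration
      c[i + l] = b[i + l] / a[i + l];
  for (size_t i = NVec; i < n; ++i) // scalar epilogue for the remainder
    c[i] = b[i] / a[i];
}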
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 69d2b9f2c1a28..71b033c4d1240 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized"); +STATISTIC(LoopsPartialAliasVectorized, + "Number of partial aliasing loops vectorized"); static cl::opt<bool> EnableEpilogueVectorization( "enable-epilogue-vectorization", cl::init(true), cl::Hidden, @@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold( "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks")); +static cl::opt<bool> ForcePartialAliasingVectorization( + "force-partial-aliasing-vectorization", cl::init(false), cl::Hidden, + cl::desc("Replace pointer diff checks with alias masks.")); + // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the // vectorizer will try to fold the tail-loop (epilogue) into the vector body @@ -1386,6 +1392,49 @@ class LoopVectorizationCostModel { return getTailFoldingStyle() != TailFoldingStyle::None; } + /// Returns true if all loop blocks should be masked. + bool allLoopBlocksMasked() const { + return foldTailByMasking() || maskPartialAliasing(); + } + + /// Decide once whether partial alias masking is enabled for this loop. + void checkIfPartialAliasMaskingIsEnabled() { + assert(!IsPartialAliasMaskingEnabled && + "Partial alias masking already checked!"); + if (!ForcePartialAliasingVectorization || !Legal->canMaskLoop() || + !Legal->getFixedOrderRecurrences().empty()) { + // Option not enabled (or loop cannot be masked). + // Note: FixedOrderRecurrences are not supported yet as we cannot handle + // the required `splice.right` with the alias-mask. + IsPartialAliasMaskingEnabled = false; + return; + } + const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking(); + if (!Checks) { + // Runtime checks not needed for this loop (no alias mask required). + IsPartialAliasMaskingEnabled = false; + return; + } + if (auto DiffChecks = Checks->getDiffChecks()) { + // We have diff checks. We can use an alias mask. + IsPartialAliasMaskingEnabled = !DiffChecks->empty(); + return; + } + // Runtime checks are not diff checks (can't be replaced with alias mask). + IsPartialAliasMaskingEnabled = false; + } + + /// Disable partial alias masking if it was previously enabled. + void disablePartialAliasMaskingIfEnabled() { + if (IsPartialAliasMaskingEnabled) + IsPartialAliasMaskingEnabled = false; + } + + /// Returns true if all loop blocks should have partial aliases masked. + bool maskPartialAliasing() const { + return IsPartialAliasMaskingEnabled.value_or(false); + } + /// Returns true if the use of wide lane masks is requested and the loop is /// using tail-folding with a lane mask for control flow. bool useWideActiveLaneMask() const { @@ -1410,7 +1457,7 @@ class LoopVectorizationCostModel { /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. 
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { - return foldTailByMasking() || Legal->blockNeedsPredication(BB); + return allLoopBlocksMasked() || Legal->blockNeedsPredication(BB); } /// Returns true if VP intrinsics with explicit vector length support should @@ -1604,6 +1651,9 @@ class LoopVectorizationCostModel { std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> ChosenTailFoldingStyle; + /// true if partial alias masking is enabled (nullopt = undecided). + std::optional<bool> IsPartialAliasMaskingEnabled; + /// true if scalable vectorization is supported and enabled. std::optional<bool> IsScalableVectorizationAllowed; @@ -1825,14 +1875,18 @@ class GeneratedRTChecks { /// The kind of cost that we are calculating TTI::TargetCostKind CostKind; + /// True if the loop is alias-masked (which allows us to omit diff checks). + bool LoopUsesAliasMasking = false; + public: GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, - TTI::TargetCostKind CostKind) + TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking) : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false), MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false), - PSE(PSE), CostKind(CostKind) {} + PSE(PSE), CostKind(CostKind), + LoopUsesAliasMasking(LoopUsesAliasMasking) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -1885,7 +1939,7 @@ } const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); - if (RtPtrChecking.Need) { + if (RtPtrChecking.Need && !LoopUsesAliasMasking) { auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); @@ -2883,8 +2937,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { if (Legal->blockNeedsPredication(I->getParent())) return true; - // If we're not folding the tail by masking, predication is unnecessary. - if (!foldTailByMasking()) + // If we're not masking, predication is unnecessary. + if (!allLoopBlocksMasked()) return false; // All that remain are instructions with side-effects originally executed in @@ -3088,10 +3142,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( auto *Ptr = getLoadStorePointerOperand(I); auto *ScalarTy = getLoadStoreType(I); + int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr); // In order to be widened, the pointer should be consecutive, first of all. - if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) + if (!Stride) return false; + // Currently, we can't handle alias masking for reverse accesses. Reversing + // the alias mask is not correct (or necessary); with tail-folding, only the + // active lane mask should be reversed, and only where the alias mask is true. + if (Stride < 0) + disablePartialAliasMaskingIfEnabled(); + // If the instruction is a store located in a predicated block, it will be // scalarized. 
if (isScalarWithPredication(I, VF)) @@ -3613,6 +3674,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } + checkIfPartialAliasMaskingIsEnabled(); + switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false); @@ -4451,6 +4514,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } + if (CM.maskPartialAliasing()) { + LLVM_DEBUG( + dbgs() + << "LEV: Epilogue vectorization not supported with alias masking.\n"); + return Result; + } + // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. if (!isCandidateForEpilogueVectorization(MainLoopVF)) { @@ -5729,7 +5799,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // stores. Note that even with tail folding we know that at least // one lane is active (i.e. generalized predication is not possible // here), and the logic below depends on this fact. - if (!foldTailByMasking()) + if (!allLoopBlocksMasked()) return true; // For scalable vectors, a uniform memop load is always @@ -6824,8 +6894,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.invalidateCostModelingDecisions(); } - if (CM.foldTailByMasking()) - Legal->prepareToFoldTailByMasking(); + if (CM.allLoopBlocksMasked()) + Legal->prepareToMaskLoop(); ElementCount MaxUserVF = UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; @@ -6937,7 +7007,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, // TODO: Remove this code after stepping away from the legacy cost model and // adding code to simplify VPlans before calculating their costs. auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop); - if (TC == VF && !CM.foldTailByMasking()) + if (TC == VF && !CM.allLoopBlocksMasked()) addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(), CostCtx.SkipCostComputation); @@ -7431,6 +7501,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // compactness. attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); + VPValue *ClampedVF = nullptr; + if (CM.maskPartialAliasing()) { + ClampedVF = materializeAliasMask( + BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(), + HasBranchWeights); + ++LoopsPartialAliasVectorized; + } + // Retrieving VectorPH now when it's easier while VPlan still has Regions. VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); @@ -7467,6 +7545,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::materializeVectorTripCount( BestVPlan, VectorPH, CM.foldTailByMasking(), CM.requiresScalarEpilogue(BestVF.isVector())); + VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF); VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF); VPlanTransforms::cse(BestVPlan); VPlanTransforms::simplifyRecipes(BestVPlan); @@ -8237,7 +8316,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Predicate and linearize the top-level loop region. 
// --------------------------------------------------------------------------- RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize, *Plan, - CM.foldTailByMasking()); + CM.foldTailByMasking(), CM.maskPartialAliasing()); // --------------------------------------------------------------------------- // Construct wide recipes and apply predication for original scalar @@ -8483,9 +8562,9 @@ void LoopVectorizationPlanner::addReductionResultComputation( // with fewer lanes than the VF. So the operands of the select would have // different numbers of lanes. Partial reductions mask the input instead. auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe()); - if (!PhiR->isInLoop() && CM.foldTailByMasking() && + if (!PhiR->isInLoop() && CM.allLoopBlocksMasked() && (!RR || !RR->isPartialReduction())) { - VPValue *Cond = vputils::findHeaderMask(*Plan); + VPValue *Cond = vputils::findLoopBodyMask(*Plan); VPIRFlags Flags = PhiTy->isFloatingPointTy() ? VPIRFlags(RdxDesc.getFastMathFlags()) : VPIRFlags(); @@ -8688,6 +8767,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks( } } +VPValue *LoopVectorizationPlanner::materializeAliasMask( + VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) { + VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check"); + VPValue *ClampedVF = + VPlanTransforms::materializeAliasMask(Plan, MinVFCheck, DiffChecks); + VPBuilder Builder(MinVFCheck); + Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); + // Check that the "ClampedVF" from the alias mask contains at least two + // elements. + VPValue *Cond = Builder.createICmp( + CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf"); + VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights); + return ClampedVF; +} + void LoopVectorizationPlanner::addMinimumIterationCheck( VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const { @@ -8800,7 +8894,8 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getPlanFor(VF.Width); { - GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind, + CM.maskPartialAliasing()); InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -9649,7 +9744,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (ORE->allowExtraAnalysis(LV_NAME)) LVP.emitInvalidCostRemarks(ORE); - GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind, + CM.maskPartialAliasing()); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); @@ -9768,6 +9864,17 @@ bool LoopVectorizePass::processLoop(Loop *L) { IC = 1; } + if (CM.maskPartialAliasing()) { + LLVM_DEBUG( + dbgs() + << "LV: Not interleaving due to partial aliasing vectorization.\n"); + IntDiagMsg = { + "PartialAliasingVectorization", + "Unable to interleave due to partial aliasing vectorization."}; + InterleaveLoop = false; + IC = 1; + } + // Emit diagnostic messages, if any. 
const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f233f0dc1b025..5e4fc7dbcb1f5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1073,6 +1073,12 @@ void VPlan::printLiveIns(raw_ostream &O) const { O << " = vector-trip-count"; } + if (AliasMask.getNumUsers() > 0) { + O << "\nLive-in "; + AliasMask.printAsOperand(O, SlotTracker); + O << " = alias-mask"; + } + if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { O << "\nLive-in "; BackedgeTakenCount->printAsOperand(O, SlotTracker); @@ -1203,6 +1209,7 @@ VPlan *VPlan::duplicate() { Old2NewVPValues[&VF] = &NewPlan->VF; Old2NewVPValues[&UF] = &NewPlan->UF; Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF; + Old2NewVPValues[&AliasMask] = &NewPlan->AliasMask; if (BackedgeTakenCount) { NewPlan->BackedgeTakenCount = new VPSymbolicValue(); Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount; @@ -1496,6 +1503,8 @@ void VPSlotTracker::assignNames(const VPlan &Plan) { if (Plan.VFxUF.getNumUsers() > 0) assignName(&Plan.VFxUF); assignName(&Plan.VectorTripCount); + if (Plan.AliasMask.getNumUsers() > 0) + assignName(&Plan.AliasMask); if (Plan.BackedgeTakenCount) assignName(Plan.BackedgeTakenCount); for (VPValue *LI : Plan.getLiveIns()) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e05da74125d1c..076cfae8ad323 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1209,8 +1209,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // part if it is scalar. In the latter case, the recipe will be removed // during unrolling. ExtractPenultimateElement, - LogicalAnd, // Non-poison propagating logical And. - LogicalOr, // Non-poison propagating logical Or. + LogicalAnd, // Non-poison propagating logical And. + LogicalOr, // Non-poison propagating logical Or. + NumActiveLanes, // Counts the number of active lanes in a mask. // Add an offset in bytes (second operand) to a base pointer (first // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). @@ -4535,6 +4536,9 @@ class VPlan { /// Represents the loop-invariant VF * UF of the vector loop region. VPSymbolicValue VFxUF; + /// Represents the loop-invariant alias mask of the vector loop region. + VPSymbolicValue AliasMask; + /// Contains all the external definitions created for this VPlan, as a mapping /// from IR Values to VPIRValues. SmallMapVector<Value *, VPIRValue *, 16> LiveIns; @@ -4677,6 +4681,10 @@ class VPlan { /// Returns VF * UF of the vector loop region. VPValue &getVFxUF() { return VFxUF; } + /// Returns the alias mask of the vector loop region. 
+ VPValue &getAliasMask() { return AliasMask; } + const VPValue &getAliasMask() const { return AliasMask; } + LLVMContext &getContext() const { return getScalarHeader()->getIRBasicBlock()->getContext(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 4b744b9128171..91b1889ed810c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -24,7 +24,8 @@ using namespace VPlanPatternMatch; #define DEBUG_TYPE "vplan" -VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) { +VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) + : Ctx(Plan.getContext()), Plan(Plan) { if (auto LoopRegion = Plan.getVectorLoopRegion()) { if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>( &LoopRegion->getEntryBasicBlock()->front())) { @@ -147,6 +148,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return inferScalarType(R->getOperand(0)); case Instruction::ExtractValue: return cast<ExtractValueInst>(R->getUnderlyingValue())->getType(); + case VPInstruction::NumActiveLanes: + return Type::getInt64Ty(Ctx); default: break; } @@ -277,8 +280,12 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { return IRV->getType(); if (isa<VPSymbolicValue>(V)) { - // All VPValues without any underlying IR value (like the vector trip count - // or the backedge-taken count) have the same type as the canonical IV. + if (V == &Plan.getAliasMask()) + return IntegerType::getInt1Ty(Ctx); + + // All other VPValues without any underlying IR value (like the vector trip + // count or the backedge-taken count) have the same type as the canonical + // IV. return CanonicalIVTy; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index dc4be4270f7f1..c268a7f22e339 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -46,6 +46,7 @@ class VPTypeAnalysis { /// count). 
Type *CanonicalIVTy; LLVMContext &Ctx; + const VPlan &Plan; Type *inferScalarTypeForRecipe(const VPBlendRecipe *R); Type *inferScalarTypeForRecipe(const VPInstruction *R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index e56691b1c960e..28b6b353d41d4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -1027,13 +1027,19 @@ static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB, } } +void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *Cond, + VPBasicBlock *CheckBlock, + bool AddBranchWeights) { + insertCheckBlockBeforeVectorLoop(Plan, CheckBlock); + addBypassBranch(Plan, CheckBlock, Cond, AddBranchWeights); +} + void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights) { VPValue *CondVPV = Plan.getOrAddLiveIn(Cond); VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock); - insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB); - addBypassBranch(Plan, CheckBlockVPBB, CondVPV, AddBranchWeights); + attachCheckBlock(Plan, CondVPV, CheckBlockVPBB, AddBranchWeights); } void VPlanTransforms::addMinimumIterationCheck( diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index dbc2e71c785ee..38a6eaf00db1b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -74,7 +74,8 @@ class VPPredicator { } /// Compute and return the mask for the vector loop header block. - void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail); + void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail, + bool MaskAliasing); /// Compute the predicate of \p VPBB, assuming that the header block of the /// loop is set to True, or to the loop mask when tail folding. @@ -153,25 +154,38 @@ void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) { setBlockInMask(VPBB, BlockMask); } -void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) { - if (!FoldTail) { +void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail, + bool MaskAliasing) { + if (!FoldTail && !MaskAliasing) { setBlockInMask(HeaderVPBB, nullptr); return; } - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - + VPValue *BlockMask = nullptr; auto &Plan = *HeaderVPBB->getPlan(); - auto *IV = - new VPWidenCanonicalIVRecipe(HeaderVPBB->getParent()->getCanonicalIV()); - Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - Builder.insert(IV); - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + if (FoldTail) { + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. 
+ + auto *IV = + new VPWidenCanonicalIVRecipe(HeaderVPBB->getParent()->getCanonicalIV()); + Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + Builder.insert(IV); + + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + } + + if (MaskAliasing) { + if (BlockMask) + BlockMask = Builder.createAnd(BlockMask, &Plan.getAliasMask()); + else + BlockMask = &Plan.getAliasMask(); + } + setBlockInMask(HeaderVPBB, BlockMask); } @@ -265,7 +279,8 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { } } -void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { +void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail, + bool MaskAliasing) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -280,7 +295,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { // convert all phi recipes of VPBB to blend recipes unless VPBB is the // header. if (VPBB == Header) { - Predicator.createHeaderMask(Header, FoldTail); + Predicator.createHeaderMask(Header, FoldTail, MaskAliasing); } else { Predicator.createBlockInMask(VPBB); Predicator.convertPhisToBlends(VPBB); @@ -314,11 +329,11 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { PrevVPBB = VPBB; } - // If we folded the tail and introduced a header mask, any extract of the - // last element must be updated to extract from the last active lane of the - // header mask instead (i.e., the lane corresponding to the last active - // iteration). - if (FoldTail) { + // If we folded the tail and introduced a header mask, or have partial alias + // masking, any extract of the last element must be updated to extract from + // the last active lane of the header mask instead (i.e., the lane + // corresponding to the last active iteration). + if (FoldTail || MaskAliasing) { assert(Plan.getExitBlocks().size() == 1 && "only a single-exit block is supported currently"); assert(Plan.getExitBlocks().front()->getSinglePredecessor() == diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d31545ebe720d..3dbb010935f97 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -458,6 +458,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const { case VPInstruction::ResumeForEpilogue: case VPInstruction::Reverse: case VPInstruction::Unpack: + case VPInstruction::NumActiveLanes: return 1; case Instruction::ICmp: case Instruction::FCmp: @@ -608,6 +609,20 @@ Value *VPInstruction::generate(VPTransformState &State) { {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); } + case VPInstruction::NumActiveLanes: { + Value *Op = State.get(getOperand(0)); + auto *VecTy = cast<VectorType>(Op->getType()); + assert(VecTy->getScalarSizeInBits() == 1 && + "NumActiveLanes only implemented for i1 vectors"); + + Value *ZExt = Builder.CreateCast( + Instruction::ZExt, Op, + VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount())); + Value *Count = + Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt); + return Builder.CreateCast(Instruction::ZExt, Count, Builder.getInt64Ty(), + "num.active.lanes"); + } case VPInstruction::FirstOrderRecurrenceSplice: { // Generate code to combine the previous and current values in vector v3. 
// @@ -1271,7 +1286,8 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ExtractLastActive || getOpcode() == VPInstruction::ComputeReductionResult || - getOpcode() == VPInstruction::AnyOf; + getOpcode() == VPInstruction::AnyOf || + getOpcode() == VPInstruction::NumActiveLanes; } bool VPInstruction::isSingleScalar() const { @@ -1545,6 +1561,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, case VPInstruction::ExtractLastActive: O << "extract-last-active"; break; + case VPInstruction::NumActiveLanes: + O << "num-active-lanes"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bb1a91ec8c963..967060346c6b7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -5112,6 +5112,60 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, "VF, UF, and VFxUF not expected to be used"); } +VPValue * +VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck, + ArrayRef<PointerDiffInfo> DiffChecks) { + VPValue &AliasMask = Plan.getAliasMask(); + VPBuilder Builder(AliasCheck, AliasCheck->begin()); + Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext()); + Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext()); + Type *PtrTy = PointerType::getUnqual(Plan.getContext()); + + VPValue *Mask = nullptr; + for (PointerDiffInfo Check : DiffChecks) { + VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart); + VPValue *Sink = + vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart); + + VPValue *SrcPtr = + Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, PtrTy, + DebugLoc::getCompilerGenerated()); + VPValue *SinkPtr = + Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, PtrTy, + DebugLoc::getCompilerGenerated()); + + VPWidenIntrinsicRecipe *WARMask = new VPWidenIntrinsicRecipe( + Intrinsic::loop_dependence_war_mask, + {SrcPtr, SinkPtr, Plan.getConstantInt(I64Ty, Check.AccessSize)}, I1Ty); + Builder.insert(WARMask); + + if (Mask) + Mask = Builder.createAnd(Mask, WARMask); + else + Mask = WARMask; + } + + // Replace all users of the symbolic alias-mask with the materialized value. 
+ AliasMask.replaceAllUsesWith(Mask); + + Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); + VPValue *NumActive = + Builder.createNaryOp(VPInstruction::NumActiveLanes, {Mask}); + return Builder.createScalarZExtOrTrunc(NumActive, IVTy, I64Ty, + DebugLoc::getCompilerGenerated()); +} + +void VPlanTransforms::fixupVFUsersForClampedVF(VPlan &Plan, + VPValue *ClampedVF) { + if (!ClampedVF) + return; + + assert(Plan.getConcreteUF() == 1 && + "Clamped VF not supported with interleaving"); + Plan.getVF().replaceAllUsesWith(ClampedVF); + Plan.getVFxUF().replaceAllUsesWith(ClampedVF); +} + DenseMap<const SCEV *, Value *> VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 972a18ebded63..0b69fd7f02f19 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -178,6 +178,8 @@ struct VPlanTransforms { /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a /// VPValue and connect the block to \p Plan, using the VPValue as branch /// condition. + static void attachCheckBlock(VPlan &Plan, VPValue *Cond, + VPBasicBlock *CheckBlock, bool AddBranchWeights); static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights); @@ -422,6 +424,14 @@ struct VPlanTransforms { static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF); + /// Materialize the symbolic alias mask in \p AliasCheck from \p DiffChecks + /// and return the number of active lanes, i.e. the runtime-clamped VF. + static VPValue *materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck, + ArrayRef<PointerDiffInfo> DiffChecks); + + /// Replaces all users of the VF and VFxUF with the runtime clamped VF. + static void fixupVFUsersForClampedVF(VPlan &Plan, VPValue *ClampedVF); + /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR /// value. A mapping from SCEV expressions to their expanded IR value is @@ -447,7 +455,8 @@ struct VPlanTransforms { /// Predicate and linearize the control-flow in the only loop region of /// \p Plan. If \p FoldTail is true, create a mask guarding the loop /// header, otherwise use all-true for the header mask. - static void introduceMasksAndLinearize(VPlan &Plan, bool FoldTail); + static void introduceMasksAndLinearize(VPlan &Plan, bool FoldTail, + bool MaskAliasing); /// Add branch weight metadata, if the \p Plan's middle block is terminated by /// a BranchOnCond recipe. 
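For reference (illustration only, not part of the patch): a scalar C++ model of the lowering that VPInstruction::generate emits for NumActiveLanes in VPlanRecipes.cpp above, whose result materializeAliasMask then narrows to the induction-variable type via createScalarZExtOrTrunc.

#include <cstdint>
#include <vector>

// Scalar model of VPInstruction::NumActiveLanes as lowered in
// VPInstruction::generate: zext the i1 mask lanes to i32, sum them with
// vector.reduce.add, then zext the i32 count to i64.
uint64_t numActiveLanes(const std::vector<bool> &Mask) {
  uint32_t Count = 0; // the i32 vector.reduce.add result
  for (bool Lane : Mask)
    Count += static_cast<uint32_t>(Lane); // per-lane zext i1 -> i32
  return Count; // final zext i32 -> i64
}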
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index f5318bb1c6515..985c0254ca148 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -46,11 +46,20 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { if (U && !isa<Instruction>(U->getValue())) return Plan.getOrAddLiveIn(U->getValue()); auto *Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded); + VPBasicBlock *EntryVPBB = Plan.getEntry(); + Plan.getEntry()->insert(Expanded, EntryVPBB->getFirstNonPhi()); return Expanded; } bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { + if (V == &Plan.getAliasMask()) + return true; + + VPValue *Mask; + if (match(V, + m_c_BinaryAnd(m_VPValue(Mask), m_Specific(&Plan.getAliasMask())))) + V = Mask; + if (isa<VPActiveLaneMaskPHIRecipe>(V)) return true; @@ -606,9 +615,30 @@ VPSingleDefRecipe *vputils::findHeaderMask(VPlan &Plan) { HeaderMask = VPI; } } + return HeaderMask; } +VPValue *vputils::findLoopBodyMask(VPlan &Plan) { + VPValue *LoopMask = findHeaderMask(Plan); + + // If an alias-mask is in use, ensure that it is included in the loop mask. + VPValue *AliasMask = &Plan.getAliasMask(); + if (AliasMask->getNumUsers() > 0) { + if (LoopMask) { + assert(AliasMask->hasOneUse() && + "expected one use (`loop-mask = and alias-mask, lane-mask`)"); + auto *VPI = dyn_cast<VPInstruction>(AliasMask->getSingleUser()); + if (vputils::isHeaderMask(VPI, Plan)) + LoopMask = VPI; + } else { + LoopMask = AliasMask; + } + } + + return LoopMask; +} + bool VPBlockUtils::isHeader(const VPBlockBase *VPB, const VPDominatorTree &VPDT) { auto *VPBB = dyn_cast<VPBasicBlock>(VPB); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index a5692699d9d76..c087619f2b02d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -151,6 +151,10 @@ VPInstruction *findComputeReductionResult(VPReductionPHIRecipe *PhiR); /// the header-mask pattern manually. VPSingleDefRecipe *findHeaderMask(VPlan &Plan); +/// Finds the mask for the loop body. This differs from `findHeaderMask` as it +/// will include the alias-mask (if present). 
+VPValue *findLoopBodyMask(VPlan &Plan); + } // namespace vputils //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll new file mode 100644 index 0000000000000..fca77657d051e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll @@ -0,0 +1,847 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize %s | FileCheck %s --check-prefix=CHECK-TF + +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define void @alias_mask( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[C2]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP12]], ptr [[TMP9]], i64 1) +; CHECK-NEXT: [[TMP8:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP8]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2 +; CHECK-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP7]], [[ALIAS_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP13]], [[ALIAS_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP14:%.*]] = select [[ALIAS_LANE_MASK]], [[WIDE_MASKED_LOAD]], splat (i8 1) +; CHECK-NEXT: [[TMP10:%.*]] = sdiv [[WIDE_MASKED_LOAD3]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP10]], ptr align 1 [[TMP15]], [[ALIAS_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; 
CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +; CHECK-TF-LABEL: define void @alias_mask( +; CHECK-TF-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-TF-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK-TF: [[FOR_BODY_PREHEADER]]: +; CHECK-TF-NEXT: br label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP1:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-TF-NEXT: [[TMP2:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP1]], ptr [[TMP2]], i64 1) +; CHECK-TF-NEXT: [[TMP4:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-TF-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP4]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-TF-NEXT: [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2 +; CHECK-TF-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP10:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = select [[TMP10]], [[WIDE_MASKED_LOAD]], splat (i8 1) +; CHECK-TF-NEXT: [[TMP14:%.*]] = sdiv [[WIDE_MASKED_LOAD3]], [[TMP13]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP14]], ptr align 1 [[TMP15]], [[TMP10]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true +; CHECK-TF-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; 
CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %div = sdiv i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %div, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body, %entry + ret void +} + +; Note: This test could emit a `llvm.loop.dependence.raw` mask to avoid creating +; a dependency between the store and the load, but it is not necessary for +; correctness. +define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define i32 @alias_mask_read_after_write( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP19]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[C2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[B1]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP9]], ptr [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP5]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2 +; CHECK-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH1:.*]] +; CHECK: [[VECTOR_PH1]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP7]], [[ALIAS_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP8]], [[ALIAS_LANE_MASK]]) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP15]], [[ALIAS_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[WIDE_MASKED_LOAD]], 
[[VEC_PHI]] +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD3]] +; CHECK-NEXT: [[TMP12]] = select [[ALIAS_LANE_MASK]], [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +; CHECK-TF-LABEL: define i32 @alias_mask_read_after_write( +; CHECK-TF-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-TF-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP19]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK-TF: [[FOR_BODY_PREHEADER]]: +; CHECK-TF-NEXT: br label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C2]] to ptr +; CHECK-TF-NEXT: [[TMP2:%.*]] = inttoptr i64 [[B1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP1]], ptr [[TMP2]], i64 4) +; CHECK-TF-NEXT: [[TMP4:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-TF-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP4]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-TF-NEXT: [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2 +; CHECK-TF-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP10:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP11]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP12]], [[TMP10]]) +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP13]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP14:%.*]] = add [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = add [[TMP14]], [[WIDE_MASKED_LOAD3]] +; CHECK-TF-NEXT: 
[[TMP16]] = select [[TMP10]], [[TMP15]], [[VEC_PHI]] +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-TF-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; CHECK-TF-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + + +entry: + %cmp19 = icmp sgt i64 %n, 0 + br i1 %cmp19, label %for.body, label %exit + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ] + %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv + %load.a = load i32, ptr %gep.a, align 2 + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %load.a, ptr %gep.c, align 2 + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %load.b = load i32, ptr %gep.b, align 2 + %add = add i32 %load.a, %accum + %add2 = add i32 %add, %load.b + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %entry, %for.body + %result = phi i32 [ 0, %entry ], [ %add2, %for.body ] + ret i32 %result +} + +define void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define void @alias_mask_multiple( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A6:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-NEXT: [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[A6]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK0:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP9]], ptr [[TMP10]], i64 1) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK1:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP13]], ptr [[TMP14]], i64 1) +; CHECK-NEXT: [[TMP15:%.*]] = and [[ALIAS_LANE_MASK0]], [[ALIAS_LANE_MASK1]] +; CHECK-NEXT: [[TMP16:%.*]] = zext [[TMP15]] to +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP16]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2 +; CHECK-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: 
[[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[TMP15]], poison) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], [[TMP15]], poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP18]], ptr align 1 [[TMP19]], [[TMP15]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +; CHECK-TF-LABEL: define void @alias_mask_multiple( +; CHECK-TF-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[A7:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-TF-NEXT: [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-TF-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK-TF: [[FOR_BODY_PREHEADER]]: +; CHECK-TF-NEXT: br label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP2:%.*]] = inttoptr i64 [[A7]] to ptr +; CHECK-TF-NEXT: [[TMP3:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_LANE_MASK0:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP2]], ptr [[TMP3]], i64 1) +; CHECK-TF-NEXT: [[TMP5:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-TF-NEXT: [[TMP6:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_LANE_MASK1:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP5]], ptr [[TMP6]], i64 1) +; CHECK-TF-NEXT: [[TMP8:%.*]] = and [[ALIAS_LANE_MASK0]], [[ALIAS_LANE_MASK1]] +; CHECK-TF-NEXT: [[TMP7:%.*]] = zext [[TMP8]] to +; CHECK-TF-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP7]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP9]] to i64 +; CHECK-TF-NEXT: [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2 +; CHECK-TF-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP14:%.*]] = sub i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[N]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP17:%.*]] = and [[ACTIVE_LANE_MASK]], 
[[TMP8]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP18]], [[TMP17]], poison) +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP19]], [[TMP17]], poison) +; CHECK-TF-NEXT: [[TMP20:%.*]] = add [[WIDE_MASKED_LOAD8]], [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP20]], ptr align 1 [[TMP21]], [[TMP17]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP16]]) +; CHECK-TF-NEXT: [[TMP22:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP23:%.*]] = xor i1 [[TMP22]], true +; CHECK-TF-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %add = add i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %add, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body, %entry + ret void +} + +; Checks using a scalar outside the loop, which requires extracting the last +; active element. 
+define i8 @alias_masking_exit_value(ptr %ptrA, ptr %ptrB) { +; CHECK-LABEL: define i8 @alias_masking_exit_value( +; CHECK-SAME: ptr [[PTRA:%.*]], ptr [[PTRB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[PTRA2:%.*]] = ptrtoaddr ptr [[PTRA]] to i64 +; CHECK-NEXT: [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[PTRA2]] to ptr +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[PTRB1]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP2]], ptr [[TMP3]], i64 1) +; CHECK-NEXT: [[TMP5:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP5]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[NUM_ACTIVE_LANES]] to i32 +; CHECK-NEXT: [[CMP_VF:%.*]] = icmp ult i32 [[TMP7]], 2 +; CHECK-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP7]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv16i8() +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP7]] to i8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[TMP9]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP8]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], [[ALIAS_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[VEC_IND]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP12]], ptr align 1 [[TMP11]], [[ALIAS_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1000, [[N_VEC]] +; CHECK-NEXT: [[TMP14:%.*]] = xor [[ALIAS_LANE_MASK]], splat (i1 true) +; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP14]], i1 false) +; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP12]], i64 [[LAST_ACTIVE_LANE]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +; CHECK-TF-LABEL: define i8 @alias_masking_exit_value( +; CHECK-TF-SAME: ptr [[PTRA:%.*]], ptr [[PTRB:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[PTRA2:%.*]] 
= ptrtoaddr ptr [[PTRA]] to i64 +; CHECK-TF-NEXT: [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64 +; CHECK-TF-NEXT: br label %[[VECTOR_MIN_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_MIN_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP0:%.*]] = inttoptr i64 [[PTRA2]] to ptr +; CHECK-TF-NEXT: [[TMP1:%.*]] = inttoptr i64 [[PTRB1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-TF-NEXT: [[TMP3:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP3]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-TF-NEXT: [[TMP5:%.*]] = trunc i64 [[NUM_ACTIVE_LANES]] to i32 +; CHECK-TF-NEXT: [[CMP_VF:%.*]] = icmp ult i32 [[TMP5]], 2 +; CHECK-TF-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP8:%.*]] = sub i32 1000, [[TMP5]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = icmp ugt i32 1000, [[TMP5]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1000) +; CHECK-TF-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv16i8() +; CHECK-TF-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP5]] to i8 +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[TMP12]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP11]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP13:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i32 [[INDEX]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i32 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[TMP13]], poison) +; CHECK-TF-NEXT: [[TMP16:%.*]] = add [[VEC_IND]], [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP16]], ptr align 1 [[TMP15]], [[TMP13]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP5]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP10]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-TF-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-TF-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: [[TMP19:%.*]] = xor [[TMP13]], splat (i1 true) +; CHECK-TF-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP19]], i1 false) +; CHECK-TF-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP16]], i64 [[LAST_ACTIVE_LANE]] +; CHECK-TF-NEXT: br [[EXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry 
], [ %iv.next, %loop ] + %gepA = getelementptr inbounds i8, ptr %ptrA, i32 %iv + %gepB = getelementptr inbounds i8, ptr %ptrB, i32 %iv + %loadA = load i8, ptr %gepA + %iv.trunc = trunc i32 %iv to i8 + %add = add i8 %iv.trunc, %loadA + store i8 %add, ptr %gepB + %iv.next = add nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + %exit.value = phi i8 [ %add, %loop ] + ret i8 %exit.value +} + +; Unsupported: Reversing the alias mask is not correct. +define void @alias_mask_reverse_iterate(ptr noalias %ptrA, ptr %ptrB, ptr %ptrC, i64 %n) { +; CHECK-LABEL: define void @alias_mask_reverse_iterate( +; CHECK-SAME: ptr noalias [[PTRA:%.*]], ptr [[PTRB:%.*]], ptr [[PTRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[PTRC2:%.*]] = ptrtoaddr ptr [[PTRC]] to i64 +; CHECK-NEXT: [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64 +; CHECK-NEXT: [[IV_START:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[IV_START]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[PTRB1]], [[PTRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 5 +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[IV_START]], [[TMP5]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[IV_START]], [[TMP8]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[IV_START]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[IV_START]], [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sub nuw nsw i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], -1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 -1, [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP15]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP16]], align 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_LOAD]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 [[TMP15]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr 
[[TMP18]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP19]], align 1 +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_LOAD6]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_LOAD7]]) +; CHECK-NEXT: [[TMP20:%.*]] = add [[REVERSE8]], [[REVERSE]] +; CHECK-NEXT: [[TMP21:%.*]] = add [[REVERSE9]], [[REVERSE5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i64 [[TMP15]] +; CHECK-NEXT: [[REVERSE10:%.*]] = call @llvm.vector.reverse.nxv16i8( [[TMP20]]) +; CHECK-NEXT: [[REVERSE11:%.*]] = call @llvm.vector.reverse.nxv16i8( [[TMP21]]) +; CHECK-NEXT: store [[REVERSE10]], ptr [[TMP23]], align 1 +; CHECK-NEXT: store [[REVERSE11]], ptr [[TMP24]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[IV_START]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[IV_START]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF12:%.*]] = urem i64 [[IV_START]], 8 +; CHECK-NEXT: [[N_VEC13:%.*]] = sub i64 [[IV_START]], [[N_MOD_VF12]] +; CHECK-NEXT: [[TMP26:%.*]] = sub i64 [[IV_START]], [[N_VEC13]] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[IV_START]], [[INDEX14]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 -7 +; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i8>, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD15]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i64 -7 +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i8>, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD17]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = add <8 x i8> [[REVERSE18]], [[REVERSE16]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 -7 +; CHECK-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x i8> [[TMP31]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: store <8 x i8> [[REVERSE19]], ptr [[TMP33]], align 1 +; CHECK-NEXT: [[INDEX_NEXT20]] = add nuw i64 [[INDEX14]], 8 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 
[[INDEX_NEXT20]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N21:%.*]] = icmp eq i64 [[IV_START]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[CMP_N21]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP26]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], %[[VECTOR_MEMCHECK]] ], [ [[IV_START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[IV]] +; CHECK-NEXT: [[LOADA:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[IV]] +; CHECK-NEXT: [[LOADB:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOADB]], [[LOADA]] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[IV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[GEP_C]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; CHECK-TF-LABEL: define void @alias_mask_reverse_iterate( +; CHECK-TF-SAME: ptr noalias [[PTRA:%.*]], ptr [[PTRB:%.*]], ptr [[PTRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[PTRC2:%.*]] = ptrtoaddr ptr [[PTRC]] to i64 +; CHECK-TF-NEXT: [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64 +; CHECK-TF-NEXT: [[IV_START:%.*]] = add i64 [[N]], -1 +; CHECK-TF-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK-TF: [[VECTOR_MEMCHECK]]: +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-TF-NEXT: [[TMP2:%.*]] = sub i64 [[PTRB1]], [[PTRC2]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4 +; CHECK-TF-NEXT: [[TMP7:%.*]] = sub i64 [[IV_START]], [[TMP6]] +; CHECK-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[IV_START]], [[TMP6]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[IV_START]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[IV_START]], [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[OFFSET_IDX]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = sub nuw nsw i64 [[TMP6]], 1 +; CHECK-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], -1 +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP12]] +; CHECK-TF-NEXT: [[REVERSE:%.*]] = call 
@llvm.vector.reverse.nxv16i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP13]], [[REVERSE]], poison) +; CHECK-TF-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_MASKED_LOAD]]) +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[OFFSET_IDX]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP12]] +; CHECK-TF-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv16i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP15]], [[REVERSE4]], poison) +; CHECK-TF-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_MASKED_LOAD5]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = add [[REVERSE6]], [[REVERSE3]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[PTRC]], i64 [[OFFSET_IDX]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP12]] +; CHECK-TF-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv16i8( [[TMP16]]) +; CHECK-TF-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv16i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[REVERSE7]], ptr align 1 [[TMP18]], [[REVERSE8]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-TF-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true +; CHECK-TF-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: br [[EXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; +entry: + %iv.start = add nsw i64 %n, -1 + br label %loop + +loop: + %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds i8, ptr %ptrA, i64 %iv + %loadA = load i8, ptr %gep.A, align 1 + %gep.B = getelementptr inbounds i8, ptr %ptrB, i64 %iv + %loadB = load i8, ptr %gep.B, align 1 + %add = add i8 %loadB, %loadA + %gep.C = getelementptr inbounds i8, ptr %ptrC, i64 %iv + store i8 %add, ptr %gep.C, align 1 + %iv.next = add nsw i64 %iv, -1 + %ec = icmp eq i64 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test taken from: scalable-first-order-recurrence.ll. Check we don't use +; an alias-mask with first-order recurrences, as we cannot handle the +; splice.right with the alias-mask/clamped VF yet. 
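+; (The splice offset is a compile-time constant, while the alias-mask path
+; steps by the runtime number of active lanes.)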
+define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) { +; CHECK-LABEL: define i32 @recurrence_1( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: br label %[[FOR_PREHEADER:.*]] +; CHECK: [[FOR_PREHEADER]]: +; CHECK-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP4]], i64 12) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[UMAX]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[B1]], -4 +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP7]] +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP12]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i32 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 [[PRE_LOAD]], i32 [[TMP15]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP11]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 4 +; CHECK-NEXT: [[WIDE_LOAD3]] = load , ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.vector.splice.right.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 1) +; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.right.nxv4i32( [[WIDE_LOAD]], [[WIDE_LOAD3]], i32 1) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = add [[WIDE_LOAD]], [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = add [[WIDE_LOAD3]], [[TMP20]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP11]] +; CHECK-NEXT: store [[TMP22]], ptr [[TMP21]], align 4 +; CHECK-NEXT: store [[TMP23]], ptr [[TMP24]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i32 [[TMP26]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD3]], i32 [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 4 +; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement [[WIDE_LOAD3]], i32 [[TMP31]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +; CHECK-TF-LABEL: define i32 @recurrence_1( +; CHECK-TF-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-TF-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: br label %[[FOR_PREHEADER:.*]] +; CHECK-TF: [[FOR_PREHEADER]]: +; CHECK-TF-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-TF-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-TF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-TF-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK-TF: [[VECTOR_MEMCHECK]]: +; CHECK-TF-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-TF-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-TF-NEXT: [[TMP6:%.*]] = add i64 [[B1]], -4 +; CHECK-TF-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], [[A2]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP5]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[TMP12:%.*]] = sub i64 [[TMP2]], [[TMP11]] +; CHECK-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP2]], [[TMP11]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP15]], 4 +; CHECK-TF-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 +; CHECK-TF-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 [[PRE_LOAD]], i32 [[TMP17]] +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_MASKED_LOAD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP18:%.*]] = add nuw nsw i64 [[INDEX]], 1 +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP19]], [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.right.nxv4i32( [[VECTOR_RECUR]], [[WIDE_MASKED_LOAD]], i32 1) +; CHECK-TF-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP22:%.*]] = add [[WIDE_MASKED_LOAD]], [[TMP20]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP22]], ptr align 4 [[TMP21]], [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) +; CHECK-TF-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP24:%.*]] = xor i1 [[TMP23]], true +; CHECK-TF-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: [[TMP25:%.*]] = xor [[ACTIVE_LANE_MASK]], splat (i1 true) +; CHECK-TF-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( [[TMP25]], i1 false) +; CHECK-TF-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-TF-NEXT: [[TMP26:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], 1 +; CHECK-TF-NEXT: [[TMP27:%.*]] = extractelement [[WIDE_MASKED_LOAD]], i64 [[TMP26]] +; CHECK-TF-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP29:%.*]] = mul nuw i32 [[TMP28]], 4 +; CHECK-TF-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 1 +; CHECK-TF-NEXT: [[TMP31:%.*]] = extractelement [[VECTOR_RECUR]], i32 [[TMP30]] +; CHECK-TF-NEXT: [[TMP32:%.*]] = icmp eq i64 [[LAST_ACTIVE_LANE]], 0 +; CHECK-TF-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP27]] +; CHECK-TF-NEXT: br [[FOR_EXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + +entry: + br label %for.preheader + +for.preheader: + %pre_load = load i32, ptr %a + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.next + %1 = load i32, ptr %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, ptr %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret i32 %0 +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 8, i32 24} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +;. 
+; CHECK-TF: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-TF: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-TF: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-TF: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-TF: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-TF: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-TF: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-TF: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll new file mode 100644 index 0000000000000..0052176a3d95b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll @@ -0,0 +1,289 @@ +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -debug-only=loop-vectorize -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -debug-only=loop-vectorize -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize -disable-output %s 2>&1 | FileCheck %s --check-prefix=CHECK-TF + + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: 'alias_mask' +; CHECK: VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.]]> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[ALIAS_MASK:%.]]> = alias-mask +; CHECK-NEXT: vp<[[TC:%.]]> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64 +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer inbounds ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%0> = load vp<[[PTR_A]]>, vp<[[ALIAS_MASK]]> +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer inbounds ir<%arrayidx2> +; CHECK-NEXT: WIDEN ir<%1> = load vp<[[PTR_B]]>, vp<[[ALIAS_MASK]]> +; CHECK-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; CHECK-NEXT: CLONE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_C:%.+]]> = vector-pointer inbounds ir<%arrayidx6> +; CHECK-NEXT: WIDEN store vp<[[PTR_C]]>, ir<%add>, vp<[[ALIAS_MASK]]> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-cond 
vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VEC_TC]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv +; CHECK-NEXT: IR %0 = load i8, ptr %arrayidx, align 1 +; CHECK-NEXT: IR %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv +; CHECK-NEXT: IR %1 = load i8, ptr %arrayidx2, align 1 +; CHECK-NEXT: IR %add = add i8 %1, %0 +; CHECK-NEXT: IR %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv +; CHECK-NEXT: IR store i8 %add, ptr %arrayidx6, align 1 +; CHECK-NEXT: IR %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +; CHECK: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4,vscale x 8,vscale x 16},UF={1}' { +; CHECK-NEXT: Live-in ir<%wide.trip.count> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64 +; CHECK-NEXT: IR [[VSCALE:%.+]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: IR [[MINTC:%.+]] = shl nuw i64 [[VSCALE]], 4 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%wide.trip.count>, ir<[[MINTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb, vector.min.vf.check +; CHECK-EMPTY: +; CHECK-NEXT: vector.min.vf.check: +; CHECK-NEXT: EMIT-SCALAR vp<[[PTR_B:%.+]]> = inttoptr ir<%b2> to ptr +; CHECK-NEXT: EMIT-SCALAR vp<[[PTR_C:%.+]]> = inttoptr ir<%c1> to ptr +; CHECK-NEXT: WIDEN-INTRINSIC vp<[[ALIAS_MASK:%.+]]> = call llvm.loop.dependence.war.mask(vp<[[PTR_B]]>, vp<[[PTR_C]]>, ir<1>) +; CHECK-NEXT: EMIT vp<[[CLAMPED_VF:%.+]]> = num-active-lanes vp<[[ALIAS_MASK]]> +; CHECK-NEXT: EMIT vp<%cmp.vf> = icmp ult vp<[[CLAMPED_VF]]>, ir<2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.vf> +; CHECK-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%wide.trip.count>, vp<[[CLAMPED_VF]]> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%wide.trip.count>, vp<%n.mod.vf> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: CLONE ir<[[VEC_PTR_A:%.+]]> = getelementptr inbounds ir<%a>, vp<%index> +; CHECK-NEXT: WIDEN ir<[[VEC_A:%.+]]> = load ir<[[VEC_PTR_A]]>, vp<[[ALIAS_MASK]]> +; CHECK-NEXT: CLONE ir<[[VEC_PTR_B:%.+]]> = getelementptr inbounds ir<%b>, vp<%index> +; CHECK-NEXT: WIDEN ir<[[VEC_B:%.+]]> = load ir<[[VEC_PTR_B]]>, vp<[[ALIAS_MASK]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[VEC_B]]>, ir<[[VEC_A]]> +; CHECK-NEXT: CLONE ir<[[VEC_PTR_C:%.+]]> = getelementptr inbounds ir<%c>, vp<%index> +; CHECK-NEXT: WIDEN store ir<[[VEC_PTR_C]]>, ir<[[ADD]]>, vp<[[ALIAS_MASK]]> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, vp<[[CLAMPED_VF]]> +; CHECK-NEXT: EMIT vp<[[EXIT_COND:%.+]]> = icmp eq vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<[[EXIT_COND]]> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; 
CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%wide.trip.count>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%n.vec>, middle.block ], [ ir<0>, ir-bb ], [ ir<0>, vector.min.vf.check ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb) +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv +; CHECK-NEXT: IR %2 = load i8, ptr %arrayidx, align 1 +; CHECK-NEXT: IR %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv +; CHECK-NEXT: IR %3 = load i8, ptr %arrayidx2, align 1 +; CHECK-NEXT: IR %add = add i8 %3, %2 +; CHECK-NEXT: IR %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv +; CHECK-NEXT: IR store i8 %add, ptr %arrayidx6, align 1 +; CHECK-NEXT: IR %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK-TF-LABEL: 'alias_mask' +; CHECK-TF: VPlan 'Initial VPlan for VF={2,4,8,16},UF>=1' { +; CHECK-TF-NEXT: Live-in vp<[[VF:%.]]> = VF +; CHECK-TF-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK-TF-NEXT: Live-in vp<[[ALIAS_MASK:%.]]> = alias-mask +; CHECK-TF-NEXT: vp<[[TC:%.]]> = original trip-count +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: ir-bb: +; CHECK-TF-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-TF-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64 +; CHECK-TF-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: vector.ph: +; CHECK-TF-NEXT: EMIT vp<%index.part.next> = VF * Part + ir<0> +; CHECK-TF-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, vp<[[TC]]>, ir<1> +; CHECK-TF-NEXT: Successor(s): vector loop +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: vector loop: { +; CHECK-TF-NEXT: vector.body: +; CHECK-TF-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-TF-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANE_MASK:%.+]]> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next> +; CHECK-TF-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-TF-NEXT: EMIT vp<[[MASK:%.+]]> = and vp<[[LANE_MASK]]>, vp<[[ALIAS_MASK]]> +; CHECK-TF-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> +; CHECK-TF-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer inbounds ir<%arrayidx> +; CHECK-TF-NEXT: WIDEN ir<%0> = load vp<[[PTR_A]]>, vp<[[MASK]]> +; CHECK-TF-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> +; CHECK-TF-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer inbounds ir<%arrayidx2> +; CHECK-TF-NEXT: WIDEN ir<%1> = load vp<[[PTR_B]]>, vp<[[MASK]]> +; CHECK-TF-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; CHECK-TF-NEXT: CLONE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<[[STEPS]]> +; CHECK-TF-NEXT: vp<[[PTR_C:%.+]]> = vector-pointer inbounds ir<%arrayidx6> +; CHECK-TF-NEXT: WIDEN store vp<[[PTR_C]]>, ir<%add>, vp<[[MASK]]> +; CHECK-TF-NEXT: EMIT vp<%index.next> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-TF-NEXT: EMIT vp<[[PART_IDX:%.+]]> = VF * Part + vp<%index.next> +; CHECK-TF-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<[[PART_IDX]]>, vp<[[TC]]>, ir<1> +; 
CHECK-TF-NEXT: EMIT vp<[[NOT_MASK:%.+]]> = not vp<%active.lane.mask.next> +; CHECK-TF-NEXT: EMIT branch-on-cond vp<[[NOT_MASK]]> +; CHECK-TF-NEXT: No successors +; CHECK-TF-NEXT: } +; CHECK-TF-NEXT: Successor(s): middle.block +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: middle.block: +; CHECK-TF-NEXT: Successor(s): ir-bb +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: ir-bb: +; CHECK-TF-NEXT: No successors +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: scalar.ph: +; CHECK-TF-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ ir<0>, ir-bb ] +; CHECK-TF-NEXT: Successor(s): ir-bb +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: ir-bb: +; CHECK-TF-NEXT: IR %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-TF-NEXT: IR %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv +; CHECK-TF-NEXT: IR %0 = load i8, ptr %arrayidx, align 1 +; CHECK-TF-NEXT: IR %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv +; CHECK-TF-NEXT: IR %1 = load i8, ptr %arrayidx2, align 1 +; CHECK-TF-NEXT: IR %add = add i8 %1, %0 +; CHECK-TF-NEXT: IR %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv +; CHECK-TF-NEXT: IR store i8 %add, ptr %arrayidx6, align 1 +; CHECK-TF-NEXT: IR %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-TF-NEXT: IR %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-TF-NEXT: No successors +; CHECK-TF-NEXT: } + +; CHECK-TF: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4,vscale x 8,vscale x 16},UF={1}' { +; CHECK-TF-NEXT: Live-in ir<%wide.trip.count> = original trip-count +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: ir-bb: +; CHECK-TF-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64 +; CHECK-TF-NEXT: Successor(s): vector.min.vf.check +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: vector.min.vf.check: +; CHECK-TF-NEXT: EMIT-SCALAR vp<[[PTR_B:%.+]]> = inttoptr ir<%b2> to ptr +; CHECK-TF-NEXT: EMIT-SCALAR vp<[[PTR_C:%.+]]> = inttoptr ir<%c1> to ptr +; CHECK-TF-NEXT: WIDEN-INTRINSIC vp<[[ALIAS_MASK:%.+]]> = call llvm.loop.dependence.war.mask(vp<[[PTR_B]]>, vp<[[PTR_C]]>, ir<1>) +; CHECK-TF-NEXT: EMIT vp<[[CLAMPED_VF:%.+]]> = num-active-lanes vp<[[ALIAS_MASK]]> +; CHECK-TF-NEXT: EMIT vp<%cmp.vf> = icmp ult vp<[[CLAMPED_VF]]>, ir<2> +; CHECK-TF-NEXT: EMIT branch-on-cond vp<%cmp.vf> +; CHECK-TF-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: vector.ph: +; CHECK-TF-NEXT: EMIT vp<[[TC_MINUS_VF:%.+]]> = TC > VF ? 
TC - VF : 0 ir<%wide.trip.count> +; CHECK-TF-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask ir<0>, ir<%wide.trip.count>, ir<1> +; CHECK-TF-NEXT: Successor(s): vector.body +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: vector.body: +; CHECK-TF-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-TF-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANE_MASK:%.+]]> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next> +; CHECK-TF-NEXT: EMIT vp<[[MASK:%.+]]> = and vp<[[LANE_MASK]]>, vp<[[ALIAS_MASK]]> +; CHECK-TF-NEXT: CLONE ir<[[VEC_PTR_A:%.+]]> = getelementptr inbounds ir<%a>, vp<%index> +; CHECK-TF-NEXT: WIDEN ir<[[VEC_A:%.+]]> = load ir<[[VEC_PTR_A]]>, vp<[[MASK]]> +; CHECK-TF-NEXT: CLONE ir<[[VEC_PTR_B:%.+]]> = getelementptr inbounds ir<%b>, vp<%index> +; CHECK-TF-NEXT: WIDEN ir<[[VEC_B:%.+]]> = load ir<[[VEC_PTR_B]]>, vp<[[MASK]]> +; CHECK-TF-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[VEC_B]]>, ir<[[VEC_A]]> +; CHECK-TF-NEXT: CLONE ir<[[VEC_PTR_C:%.+]]> = getelementptr inbounds ir<%c>, vp<%index> +; CHECK-TF-NEXT: WIDEN store ir<[[VEC_PTR_C]]>, ir<[[ADD]]>, vp<[[MASK]]> +; CHECK-TF-NEXT: EMIT vp<%index.next> = add vp<%index>, vp<[[CLAMPED_VF]]> +; CHECK-TF-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%index>, vp<[[TC_MINUS_VF]]>, ir<1> +; CHECK-TF-NEXT: EMIT vp<[[EXIT_COND:%.+]]> = not vp<%active.lane.mask.next> +; CHECK-TF-NEXT: EMIT branch-on-cond vp<[[EXIT_COND]]> +; CHECK-TF-NEXT: Successor(s): middle.block, vector.body +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: middle.block: +; CHECK-TF-NEXT: Successor(s): ir-bb +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: ir-bb: +; CHECK-TF-NEXT: No successors +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: ir-bb: +; CHECK-TF-NEXT: Successor(s): ir-bb +; CHECK-TF-EMPTY: +; CHECK-TF-NEXT: ir-bb: +; CHECK-TF-NEXT: IR %indvars.iv = phi i64 [ 0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: ir<0> from ir-bb) +; CHECK-TF-NEXT: IR %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv +; CHECK-TF-NEXT: IR %0 = load i8, ptr %arrayidx, align 1 +; CHECK-TF-NEXT: IR %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv +; CHECK-TF-NEXT: IR %1 = load i8, ptr %arrayidx2, align 1 +; CHECK-TF-NEXT: IR %add = add i8 %1, %0 +; CHECK-TF-NEXT: IR %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv +; CHECK-TF-NEXT: IR store i8 %add, ptr %arrayidx6, align 1 +; CHECK-TF-NEXT: IR %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-TF-NEXT: IR %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-TF-NEXT: No successors +; CHECK-TF-NEXT: } + +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv + %1 = load i8, ptr %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv + store i8 %add, ptr %arrayidx6, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 
%wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll new file mode 100644 index 0000000000000..dc9c977c0d20c --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll @@ -0,0 +1,171 @@ +; RUN: opt -S -debug-only=loop-vectorize -force-vector-width=4 -passes=loop-vectorize -force-partial-aliasing-vectorization -disable-output %s 2>&1 | FileCheck %s + +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: 'alias_mask' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[ALIAS_MASK:%.+]]> = alias-mask +; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64 +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: Successor(s): pred.load +; CHECK-EMPTY: +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[ALIAS_MASK]]> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx> (S->V) +; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[VEC_A:%.+]]> = ir<%0> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[VEC_B:%.+]]> = ir<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): for.body.1 +; CHECK-EMPTY: +; CHECK-NEXT: for.body.1: +; CHECK-NEXT: WIDEN ir<%add> = add vp<[[VEC_B]]>, vp<[[VEC_A]]> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[ALIAS_MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx6> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): for.body.2 +; CHECK-EMPTY: +; CHECK-NEXT: for.body.2: +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block + +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%wide.trip.count> = original trip-count +; CHECK-EMPTY: +; 
CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%wide.trip.count>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb, vector.min.vf.check +; CHECK-EMPTY: +; CHECK-NEXT: vector.min.vf.check: +; CHECK-NEXT: EMIT-SCALAR vp<[[PTR_B:%.+]]> = inttoptr ir<%b2> to ptr +; CHECK-NEXT: EMIT-SCALAR vp<[[PTR_C:%.+]]> = inttoptr ir<%c1> to ptr +; CHECK-NEXT: WIDEN-INTRINSIC vp<[[ALIAS_MASK:%.+]]> = call llvm.loop.dependence.war.mask(vp<[[PTR_B]]>, vp<[[PTR_C]]>, ir<1>) +; CHECK-NEXT: EMIT vp<[[CLAMPED_VF:%.+]]> = num-active-lanes vp<[[ALIAS_MASK]]> +; CHECK-NEXT: EMIT vp<%cmp.vf> = icmp ult vp<[[CLAMPED_VF]]>, ir<2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.vf> +; CHECK-NEXT: Successor(s): ir-bb, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%wide.trip.count>, vp<[[CLAMPED_VF]]> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%wide.trip.count>, vp<%n.mod.vf> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, for.body.2 ] +; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%index>, ir<1>, vp<[[CLAMPED_VF]]> +; CHECK-NEXT: Successor(s): pred.load +; CHECK-EMPTY: +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[ALIAS_MASK]]> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%8> +; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx> (S->V) +; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%8> +; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[VEC_A:%.+]]> = ir<%0> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[VEC_B:%.+]]> = ir<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): for.body.1 +; CHECK-EMPTY: +; CHECK-NEXT: for.body.1: +; CHECK-NEXT: WIDEN ir<%add> = add vp<[[VEC_B]]>, vp<[[VEC_A]]> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[ALIAS_MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%8> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx6> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): for.body.2 +; CHECK-EMPTY: +; CHECK-NEXT: for.body.2: +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, vp<[[CLAMPED_VF]]> +; CHECK-NEXT: EMIT vp<[[EXIT_COND:%.+]]> = icmp eq vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<[[EXIT_COND]]> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: + +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = 
%for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv + %1 = load i8, ptr %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv + store i8 %add, ptr %arrayidx6, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll index 0d923183e251a..2a77ae3609ad7 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll @@ -478,12 +478,12 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count ; CHECK-NEXT: vp<[[VP3:%[0-9]+]]> = original trip-count ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060 -; CHECK-NEXT: IR %inc = add i64 %div, 1 -; CHECK-NEXT: EMIT vp<[[VP3]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) -; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) -; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[VP4:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) +; CHECK-NEXT: EMIT vp<[[VP3]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) +; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060 +; CHECK-NEXT: IR %inc = add i64 %div, 1 +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: vp<[[VP5:%[0-9]+]]> = DERIVED-IV ir<0> + vp<[[VP2]]> * vp<[[VP4]]> diff --git a/llvm/test/Transforms/LoopVectorize/alias-mask.ll b/llvm/test/Transforms/LoopVectorize/alias-mask.ll new file mode 100644 index 0000000000000..22a9d75d28da6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/alias-mask.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -S -force-partial-aliasing-vectorization -force-vector-width=2 -passes=loop-vectorize %s | FileCheck %s +; RUN: opt -S -force-partial-aliasing-vectorization -force-vector-interleave=2 -force-vector-width=2 -passes=loop-vectorize %s | FileCheck %s +; RUN: opt -S -force-partial-aliasing-vectorization -epilogue-vectorization-force-VF=2 -force-vector-interleave=2 -force-vector-width=2 -passes=loop-vectorize %s | FileCheck %s + +; Note: -force-vector-interleave and -epilogue-vectorization-force-VF do not +; change the results as alias-masking is not supported with interleaving or +; epilogue vectorization.
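+; With an alias mask the induction step is the runtime number of active lanes
+; rather than a fixed multiple of VF, which both of those transforms assume.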
+
+define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[B3:%.*]] = ptrtoaddr ptr [[B]] to i64
+; CHECK-NEXT: [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK: [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[B3]] to ptr
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[C2]] to ptr
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr [[TMP12]], ptr [[TMP9]], i64 1)
+; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
+; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[CMP_VF:%.*]] = icmp ult i64 [[NUM_ACTIVE_LANES]], 2
+; CHECK-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX1]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i8> poison, i8 [[TMP28]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i8> poison, i8 [[TMP33]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i8> [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i8> [ poison, %[[VECTOR_BODY]] ], [ [[TMP34]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP36]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK: [[PRED_LOAD_IF3]]:
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i8> [[TMP14]], i8 [[TMP18]], i32 1
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP20]], align 1
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i8> [[TMP35]], i8 [[TMP21]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK: [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i8> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i8> [ [[TMP35]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP22]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP2]], <2 x i8> [[TMP23]], <2 x i8> splat (i8 1)
+; CHECK-NEXT: [[TMP26:%.*]] = sdiv <2 x i8> [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i8> [[TMP26]], i32 0
+; CHECK-NEXT: store i8 [[TMP29]], ptr [[TMP15]], align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i8> [[TMP26]], i32 1
+; CHECK-NEXT: store i8 [[TMP32]], ptr [[TMP31]], align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[NUM_ACTIVE_LANES]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+
+entry:
+  %cmp11 = icmp sgt i64 %n, 0
+  br i1 %cmp11, label %for.body, label %exit
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %div = sdiv i8 %load.b, %load.a
+  %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+  store i8 %div, ptr %gep.c, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+  ret void
+}
+
+; Checks using a scalar outside the loop, which requires extracting the last
+; active element.
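+; In the generated middle block this is done by inverting the loop-invariant
+; alias mask and locating its first inactive lane; roughly (names are
+; illustrative, the actual pattern is in the CHECK lines below):
+;
+;   %not.mask       = xor <2 x i1> %alias.mask, splat (i1 true)
+;   %first.inactive = llvm.experimental.cttz.elts(%not.mask) ; first false lane
+;   %last.active    = %first.inactive - 1
+;   %exit.value     = extractelement <2 x i8> %vec, i64 %last.active
+;
+; This relies on the mask being a prefix of active lanes followed by inactive
+; ones, which is expected for a dependence-distance mask.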
+define i8 @alias_masking_exit_value(ptr %ptrA, ptr %ptrB) {
+; CHECK-LABEL: define i8 @alias_masking_exit_value(
+; CHECK-SAME: ptr [[PTRA:%.*]], ptr [[PTRB:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[PTRA2:%.*]] = ptrtoaddr ptr [[PTRA]] to i64
+; CHECK-NEXT: [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64
+; CHECK-NEXT: br label %[[VECTOR_MIN_VF_CHECK:.*]]
+; CHECK: [[VECTOR_MIN_VF_CHECK]]:
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[PTRA2]] to ptr
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[PTRB1]] to ptr
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr [[TMP2]], ptr [[TMP3]], i64 1)
+; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i1> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP5]])
+; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[NUM_ACTIVE_LANES]] to i32
+; CHECK-NEXT: [[CMP_VF:%.*]] = icmp ult i32 [[TMP7]], 2
+; CHECK-NEXT: br i1 [[CMP_VF]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP7]] to i8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = add i32 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX1]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i8> poison, i8 [[TMP29]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i8> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK: [[PRED_LOAD_IF3]]:
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> [[TMP13]], i8 [[TMP16]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK: [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i8> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i8> [[VEC_IND]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP19]], i32 0
+; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP11]], align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i8> [[TMP19]], i32 1
+; CHECK-NEXT: store i8 [[TMP25]], ptr [[TMP24]], align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP27:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
+; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP27]], i1 false)
+; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i8> [[TMP19]], i64 [[LAST_ACTIVE_LANE]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gepA = getelementptr inbounds i8, ptr %ptrA, i32 %iv
+  %gepB = getelementptr inbounds i8, ptr %ptrB, i32 %iv
+  %loadA = load i8, ptr %gepA
+  %iv.trunc = trunc i32 %iv to i8
+  %add = add i8 %iv.trunc, %loadA
+  store i8 %add, ptr %gepB
+  %iv.next = add nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %exit.value = phi i8 [ %add, %loop ]
+  ret i8 %exit.value
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index 6c3e7575c6c32..ed5015306b227 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -437,8 +437,8 @@ define i64 @ivopt_widen_ptr_indvar_1(ptr noalias %a, i64 %stride, i64 %n) {
 ;
 ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_1(
 ; STRIDED-NEXT: entry:
-; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; STRIDED: vector.ph:
@@ -522,8 +522,8 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) {
 ;
 ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_2(
 ; STRIDED-NEXT: entry:
-; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; STRIDED: vector.ph:
@@ -629,8 +629,8 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) {
 ;
 ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_3(
 ; STRIDED-NEXT: entry:
-; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; STRIDED: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index 55c73cb0928ff..c97fc36ac76d1 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -205,10 +205,15 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
 ; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]]
 ; CHECK: [[LOOP_2_PREHEADER]]:
 ; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i32 2, [[STEP]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP0]]
+; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]]
 ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
 ; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1
 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]]
 ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1)
@@ -217,11 +222,6 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
 ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1)
 ; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]]
-; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK: [[VECTOR_SCEVCHECK]]: