diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index f7055dbde275b..3ea43e8079bae 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -171,6 +171,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
     return (ScalarOpdIdx == 2);
   case Intrinsic::experimental_vp_splice:
     return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+  case Intrinsic::loop_dependence_war_mask:
+    return true;
   default:
     return false;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9c34781bb7447..aac4d77b79087 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
 STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsPartialAliasVectorized,
+          "Number of partial aliasing loops vectorized");
 
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -205,6 +207,10 @@ static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
     cl::desc("Assume the target supports masked memory operations (used for "
              "testing)."));
 
+static cl::opt<bool> ForcePartialAliasingVectorization(
+    "force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Replace pointer diff checks with alias masks."));
+
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -876,6 +882,8 @@ enum ScalarEpilogueLowering {
   CM_ScalarEpilogueNotAllowedUsePredicate
 };
 
+enum class AliasMaskingStatus { NotDecided, Disabled, Enabled };
+
 /// LoopVectorizationCostModel - estimates the expected speedups due to
 /// vectorization.
 /// In many cases vectorization is not profitable. This can happen because of
@@ -1382,6 +1390,64 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  void tryToEnablePartialAliasMasking() {
+    assert(foldTailByMasking() && "Expected tail folding to be enabled!");
+    assert(!foldTailWithEVL() &&
+           "Did not expect to enable alias masking with EVL!");
+    assert(PartialAliasMaskingStatus == AliasMaskingStatus::NotDecided);
+
+    // Assume we fail to enable alias masking (in case we early exit).
+    PartialAliasMaskingStatus = AliasMaskingStatus::Disabled;
+
+    // Note: FixedOrderRecurrences are not supported yet as we cannot handle
+    // the required `splice.right` with the alias-mask.
+    if (!ForcePartialAliasingVectorization ||
+        !Legal->getFixedOrderRecurrences().empty())
+      return;
+
+    const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
+    if (!Checks)
+      return;
+
+    auto DiffChecks = Checks->getDiffChecks();
+    if (!DiffChecks || DiffChecks->empty())
+      return;
+
+    [[maybe_unused]] auto HasPointerArgs = [](CallBase *CB) {
+      return any_of(CB->args(), [](Value const *Arg) {
+        return Arg->getType()->isPointerTy();
+      });
+    };
+
+    for (BasicBlock *BB : TheLoop->blocks()) {
+      for (Instruction &I : *BB) {
+        if (!isa<LoadInst, StoreInst>(I)) {
+          [[maybe_unused]] auto *Call = dyn_cast<CallBase>(&I);
+          assert((!I.mayReadOrWriteMemory() ||
+                  (Call && !HasPointerArgs(Call))) &&
+                 "Skipped unexpected memory access");
+          continue;
+        }
+
+        Type *ScalarTy = getLoadStoreType(&I);
+        Value *Ptr = getLoadStorePointerOperand(&I);
+
+        // Currently, we can't handle alias masking in reverse. Reversing the
+        // alias mask is not correct (or necessary). When combined with
+        // tail-folding the active lane mask should only be reversed where the
+        // alias-mask is true.
+        if (Legal->isConsecutivePtr(ScalarTy, Ptr) == -1)
+          return;
+      }
+    }
+
+    PartialAliasMaskingStatus = AliasMaskingStatus::Enabled;
+  }
+
+  /// Returns true if all loop blocks should have partial aliases masked.
+  bool maskPartialAliasing() const {
+    return PartialAliasMaskingStatus == AliasMaskingStatus::Enabled;
+  }
+
   /// Returns true if the use of wide lane masks is requested and the loop is
   /// using tail-folding with a lane mask for control flow.
   bool useWideActiveLaneMask() const {
@@ -1499,6 +1565,21 @@ class LoopVectorizationCostModel {
   /// initialized during object construction.
   std::optional<unsigned> VScaleForTuning;
 
+  bool isUniform(Value *V, ElementCount VF) const {
+    // With alias-masking our runtime VF is [2, VF] (and not necessarily a
+    // power-of-two). Something that is uniform for VF may not be for the full
+    // range.
+    assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided);
+    return PartialAliasMaskingStatus == AliasMaskingStatus::Disabled &&
+           Legal->isUniform(V, VF);
+  }
+
+  bool isUniformMemOp(Instruction &I, ElementCount VF) const {
+    assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided);
+    return PartialAliasMaskingStatus == AliasMaskingStatus::Disabled &&
+           Legal->isUniformMemOp(I, VF);
+  }
+
   /// Initializes the value of vscale used for tuning the cost model. If
   /// vscale_range.min == vscale_range.max then return vscale_range.max, else
   /// return the value returned by the corresponding TTI method.
@@ -1603,6 +1684,9 @@ class LoopVectorizationCostModel {
   /// Control finally chosen tail folding style.
   TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;
 
+  /// If partial alias masking is enabled/disabled or not decided.
+  AliasMaskingStatus PartialAliasMaskingStatus = AliasMaskingStatus::NotDecided;
+
   /// true if scalable vectorization is supported and enabled.
   std::optional<bool> IsScalableVectorizationAllowed;
 
@@ -1824,14 +1908,18 @@ class GeneratedRTChecks {
   /// The kind of cost that we are calculating
   TTI::TargetCostKind CostKind;
 
+  /// True if the loop is alias-masked (which allows us to omit diff checks).
+  bool LoopUsesAliasMasking = false;
+
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
-                    TTI::TargetCostKind CostKind)
+                    TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
       : DT(DT), LI(LI), TTI(TTI),
         SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
         MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
-        PSE(PSE), CostKind(CostKind) {}
+        PSE(PSE), CostKind(CostKind),
+        LoopUsesAliasMasking(LoopUsesAliasMasking) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1884,7 +1972,10 @@ class GeneratedRTChecks {
     }
 
     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
-    if (RtPtrChecking.Need) {
+    // TODO: We need to estimate the cost of alias-masking in
+    // GeneratedRTChecks::getCost(). We can't check the MemCheckBlock as the
+    // alias-mask is generated later in VPlan.
+    if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                  "vector.memcheck");
@@ -3148,7 +3239,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
       if (Iter != Uniforms.end() && !Iter->second.contains(I))
         return false;
     }
-    if (!Legal->isUniformMemOp(*I, VF))
+    if (!isUniformMemOp(*I, VF))
       return false;
     if (isa<LoadInst>(I))
      // Loading the same address always produces the same result - at least
@@ -3225,7 +3316,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 
     // If the pointer can be proven to be uniform, always add it to the
     // worklist.
-    if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
+    if (isa<Instruction>(Ptr) && isUniform(Ptr, VF))
       AddToWorklistIfAllowed(cast<Instruction>(Ptr));
 
     if (IsUniformMemOpUse(&I))
@@ -3544,6 +3635,12 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
 
 FixedScalableVFPair
 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
+  // Make sure once we return PartialAliasMaskingStatus is not "NotDecided".
+  scope_exit EnsureAliasMaskingStatusIsDecidedOnReturn([this] {
+    if (PartialAliasMaskingStatus == AliasMaskingStatus::NotDecided)
+      PartialAliasMaskingStatus = AliasMaskingStatus::Disabled;
+  });
+
   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
     // TODO: It may be useful to do since it's still likely to be dynamically
     // uniform if the target can skip.
@@ -3717,6 +3814,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     assert(ContainsScalableVF && "Expected scalable vector factor.");
     MaxFactors.FixedVF = ElementCount::getFixed(1);
+  } else {
+    tryToEnablePartialAliasMasking();
   }
   return MaxFactors;
 }
@@ -4440,6 +4539,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     return Result;
   }
 
+  if (CM.maskPartialAliasing()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LEV: Epilogue vectorization not supported with alias masking.\n");
+    return Result;
+  }
+
   // Not really a cost consideration, but check for unsupported cases here to
   // simplify the logic.
if (!isCandidateForEpilogueVectorization(MainLoopVF)) { @@ -5363,7 +5469,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, ElementCount VF) { - assert(Legal->isUniformMemOp(*I, VF)); + assert(isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); Type *PtrTy = getLoadStorePointerOperand(I)->getType(); @@ -5402,7 +5508,7 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = Ptr->getType(); - if (!Legal->isUniform(Ptr, VF)) + if (!isUniform(Ptr, VF)) PtrTy = toVectorTy(PtrTy, VF); unsigned IID = I->getOpcode() == Instruction::Load @@ -5732,7 +5838,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { if (isa(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; - if (Legal->isUniformMemOp(I, VF)) { + if (isUniformMemOp(I, VF)) { auto IsLegalToScalarize = [&]() { if (!VF.isScalable()) // Scalarization of fixed length vectors "just works". @@ -5903,7 +6009,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InstWidening Decision = getWideningDecision(I, VF); if (!isPredicatedInst(I) && (Decision == CM_Widen || Decision == CM_Widen_Reverse || - (!Legal->isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) { + (!isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) { // Scalarize a widened load of address or update the cost of a scalar // load of an address. setWideningDecision( @@ -7388,6 +7494,14 @@ DenseMap LoopVectorizationPlanner::executePlan( // compactness. attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); + if (CM.maskPartialAliasing()) { + assert(CM.foldTailByMasking() && "Expected tail folding to be enabled"); + VPlanTransforms::materializeAliasMaskCheckBlock( + BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(), + HasBranchWeights); + ++LoopsPartialAliasVectorized; + } + // Retrieving VectorPH now when it's easier while VPlan still has Regions. VPBasicBlock *VectorPH = cast(BestVPlan.getVectorPreheader()); @@ -8378,6 +8492,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow); } + if (CM.maskPartialAliasing()) + RUN_VPLAN_PASS(VPlanTransforms::attachAliasMaskToHeaderMask, *Plan); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -8753,7 +8870,10 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getPlanFor(VF.Width); { - GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind); + assert(!CM.maskPartialAliasing() && + "Did not expect to alias-mask outer loop"); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind, + /*UsesAliasMasking=*/false); InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -9582,7 +9702,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (ORE->allowExtraAnalysis(LV_NAME)) LVP.emitInvalidCostRemarks(ORE); - GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind, + CM.maskPartialAliasing()); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); @@ -9692,6 +9813,17 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Override IC if user provided an interleave count. IC = UserIC > 0 ? 
UserIC : IC; + if (CM.maskPartialAliasing()) { + LLVM_DEBUG( + dbgs() + << "LV: Not interleaving due to partial aliasing vectorization.\n"); + IntDiagMsg = { + "PartialAliasingVectorization", + "Unable to interleave due to partial aliasing vectorization."}; + InterleaveLoop = false; + IC = 1; + } + // Emit diagnostic messages, if any. const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9ed9d07151d7f..0929290baaed9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1226,6 +1226,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // The size of the mask returned is VF * Multiplier (UF, third op). ActiveLaneMask, ExplicitVectorLength, + // Represents the incoming loop-invariant alias-mask. All memory accesses + // in the loop must stay within the active lanes. + IncomingAliasMask, CalculateTripCountMinusVF, // Increment the canonical IV separately for each unrolled part. CanonicalIVIncrementForPart, @@ -1265,8 +1268,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // part if it is scalar. In the latter case, the recipe will be removed // during unrolling. ExtractPenultimateElement, - LogicalAnd, // Non-poison propagating logical And. - LogicalOr, // Non-poison propagating logical Or. + LogicalAnd, // Non-poison propagating logical And. + LogicalOr, // Non-poison propagating logical Or. + NumActiveLanes, // Counts the number of active lanes in a mask. // Add an offset in bytes (second operand) to a base pointer (first // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 9667227a329ac..fabbcc6cdc5fc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -110,6 +110,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return inferScalarType(R->getOperand(1)); case VPInstruction::ExplicitVectorLength: return Type::getIntNTy(Ctx, 32); + case VPInstruction::IncomingAliasMask: + return IntegerType::get(Ctx, 1); case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::Not: case VPInstruction::CalculateTripCountMinusVF: @@ -123,6 +125,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return inferScalarType(R->getOperand(1)); case VPInstruction::FirstActiveLane: case VPInstruction::LastActiveLane: + case VPInstruction::NumActiveLanes: // Assume that the maximum possible number of elements in a vector fits // within the index type for the default address space. 
return DL.getIndexType(Ctx, 0); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 2747b7e8c92cb..c89daa5c6a59e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -1207,13 +1207,19 @@ static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB, } } +void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *Cond, + VPBasicBlock *CheckBlock, + bool AddBranchWeights) { + insertCheckBlockBeforeVectorLoop(Plan, CheckBlock); + addBypassBranch(Plan, CheckBlock, Cond, AddBranchWeights); +} + void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights) { VPValue *CondVPV = Plan.getOrAddLiveIn(Cond); VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock); - insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB); - addBypassBranch(Plan, CheckBlockVPBB, CondVPV, AddBranchWeights); + attachCheckBlock(Plan, CondVPV, CheckBlockVPBB, AddBranchWeights); } void VPlanTransforms::addMinimumIterationCheck( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4748b880721fa..cc1c7e97c1b54 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -447,6 +447,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const { switch (Opcode) { case VPInstruction::StepVector: case VPInstruction::VScale: + case VPInstruction::IncomingAliasMask: return 0; case Instruction::Alloca: case Instruction::ExtractValue: @@ -464,6 +465,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const { case VPInstruction::ResumeForEpilogue: case VPInstruction::Reverse: case VPInstruction::Unpack: + case VPInstruction::NumActiveLanes: return 1; case Instruction::ICmp: case Instruction::FCmp: @@ -611,6 +613,20 @@ Value *VPInstruction::generate(VPTransformState &State) { {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); } + case VPInstruction::NumActiveLanes: { + Value *Op = State.get(getOperand(0)); + auto *VecTy = cast(Op->getType()); + assert(VecTy->getScalarSizeInBits() == 1 && + "NumActiveLanes only implemented for i1 vectors"); + + Type *Ty = State.TypeAnalysis.inferScalarType(this); + Value *ZExt = Builder.CreateCast( + Instruction::ZExt, Op, + VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount())); + Value *Count = + Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt); + return Builder.CreateCast(Instruction::ZExt, Count, Ty, "num.active.lanes"); + } case VPInstruction::FirstOrderRecurrenceSplice: { // Generate code to combine the previous and current values in vector v3. 
// @@ -1282,7 +1298,8 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ExtractLastActive || getOpcode() == VPInstruction::ComputeReductionResult || - getOpcode() == VPInstruction::AnyOf; + getOpcode() == VPInstruction::AnyOf || + getOpcode() == VPInstruction::NumActiveLanes; } bool VPInstruction::isSingleScalar() const { @@ -1355,6 +1372,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractLastPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: + case VPInstruction::IncomingAliasMask: case VPInstruction::ExitingIVValue: case VPInstruction::ExplicitVectorLength: case VPInstruction::FirstActiveLane: @@ -1478,6 +1496,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, case VPInstruction::ActiveLaneMask: O << "active lane mask"; break; + case VPInstruction::IncomingAliasMask: + O << "incoming-alias-mask"; + break; case VPInstruction::ExplicitVectorLength: O << "EXPLICIT-VECTOR-LENGTH"; break; @@ -1568,6 +1589,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, case VPInstruction::ExtractLastActive: O << "extract-last-active"; break; + case VPInstruction::NumActiveLanes: + O << "num-active-lanes"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e709cce1bae44..8c20c34e7a221 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1149,7 +1149,7 @@ void VPlanTransforms::optimizeInductionLiveOutUsers( } } -/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing +/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing /// them with already existing recipes expanding the same SCEV expression. static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { DenseMap SCEV2VPV; @@ -1163,7 +1163,11 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR); if (Inserted) continue; + ExpR->replaceAllUsesWith(V->second); + if (ExpR == Plan.getTripCount()) + Plan.resetTripCount(V->second); + ExpR->eraseFromParent(); } } @@ -5141,8 +5145,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC); VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin(); if (auto *StepR = Step->getDefiningRecipe()) { - assert(StepR->getParent() == VectorPHVPBB && - "Step must be defined in VectorPHVPBB"); + assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) && + "Step VPBB must dominate VectorPHVPBB"); // Insert after Step's definition to maintain valid def-use ordering. 
InsertPt = std::next(StepR->getIterator()); } @@ -5228,6 +5232,116 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, VFxUF.replaceAllUsesWith(MulByUF); } +void VPlanTransforms::attachAliasMaskToHeaderMask(VPlan &Plan) { + VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan); + auto *HeaderMaskDef = HeaderMask->getDefiningRecipe(); + + VPBuilder Builder(Plan.getVectorPreheader()); + auto *AliasMask = Builder.createNaryOp(VPInstruction::IncomingAliasMask, {}); + + if (HeaderMaskDef->isPhi()) + Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi()); + else + Builder = VPBuilder::getToInsertAfter(HeaderMaskDef); + + // Update all existing users of the header mask to "HeaderMask & AliasMask". + auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask); + HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) { + return dyn_cast(&U) != ClampedHeaderMask; + }); +} + +VPValue * +VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, + ArrayRef DiffChecks) { + VPBuilder Builder(AliasCheckVPBB); + Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext()); + Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext()); + Type *PtrTy = PointerType::getUnqual(Plan.getContext()); + + VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan); + assert(IncomingAliasMask && "Expected an alias mask!"); + + VPValue *AliasMask = nullptr; + for (PointerDiffInfo Check : DiffChecks) { + VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart); + VPValue *Sink = + vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart); + + VPValue *SrcPtr = + Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, PtrTy, + DebugLoc::getCompilerGenerated()); + VPValue *SinkPtr = + Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, PtrTy, + DebugLoc::getCompilerGenerated()); + + VPWidenIntrinsicRecipe *WARMask = new VPWidenIntrinsicRecipe( + Intrinsic::loop_dependence_war_mask, + {SrcPtr, SinkPtr, Plan.getConstantInt(I64Ty, Check.AccessSize)}, I1Ty); + Builder.insert(WARMask); + + if (AliasMask) + AliasMask = Builder.createAnd(AliasMask, WARMask); + else + AliasMask = WARMask; + } + + Type *IVTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); + VPValue *NumActive = + Builder.createNaryOp(VPInstruction::NumActiveLanes, {AliasMask}); + VPValue *ClampedVF = Builder.createScalarZExtOrTrunc( + NumActive, IVTy, I64Ty, DebugLoc::getCompilerGenerated()); + + IncomingAliasMask->replaceAllUsesWith(AliasMask); + + return ClampedVF; +} + +void VPlanTransforms::materializeAliasMaskCheckBlock( + VPlan &Plan, ArrayRef DiffChecks, bool HasBranchWeights) { + VPBasicBlock *ClampedVFCheck = + Plan.createVPBasicBlock("vector.clamped.vf.check"); + + VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks); + VPBuilder Builder(ClampedVFCheck); + DebugLoc DL = DebugLoc::getCompilerGenerated(); + Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); + + // Check the "ClampedVF" from the alias mask is larger than one. + VPValue *IsScalar = + Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF, + Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar"); + + VPValue *TripCount = Plan.getTripCount(); + VPValue *MaxUIntTripCount = + Plan.getConstantInt(cast(TCTy)->getMask()); + VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount); + + // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF. + // Note: The ClampedVF may not be a power-of-two. 
This means the loop exit + // condition (index.next == n.vec) may not be correct in the case of an + // overflow. The issue is `n.vec` could be zero due to an overflow, but + // index.next is not guaranteed to overflow to zero as the ClampedVF is not a + // power-of-two). + VPValue *TripCountCheck = Builder.createICmp( + ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow"); + + VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL); + attachCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights); + + // Materialize the trip count early as this will add a use of (VFxUF) that + // needs to be replaced with the ClampedVF. + materializeVectorTripCount(Plan, Plan.getVectorPreheader(), + /*TailByMasking=*/true, + /*RequiresScalarEpilogue=*/false, + &Plan.getVFxUF()); + + assert(Plan.getConcreteUF() == 1 && + "Clamped VF not supported with interleaving"); + Plan.getVF().replaceAllUsesWith(ClampedVF); + Plan.getVFxUF().replaceAllUsesWith(ClampedVF); +} + DenseMap VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 21a1db22126a0..4f1b42d32b924 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -183,6 +183,8 @@ struct VPlanTransforms { /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a /// VPValue and connect the block to \p Plan, using the VPValue as branch /// condition. + static void attachCheckBlock(VPlan &Plan, VPValue *Cond, + VPBasicBlock *CheckBlock, bool AddBranchWeights); static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights); @@ -427,6 +429,21 @@ struct VPlanTransforms { static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF); + /// Attaches the alias-mask to the existing header-mask. + static void attachAliasMaskToHeaderMask(VPlan &Plan); + + /// Materializes within the \p AliasCheckVPBB block. Updates the header mask + /// of the loop to use the alias mask. Returns the clamped VF. + static VPValue *materializeAliasMask(VPlan &Plan, + VPBasicBlock *AliasCheckVPBB, + ArrayRef DiffChecks); + + /// Materializes the alias mask within a check block before the loop. The + /// vector loop will only be entered if the clamped VF from the alias mask + /// is not scalar. + static void materializeAliasMaskCheckBlock( + VPlan &Plan, ArrayRef DiffChecks, bool HasBranchWeights); + /// Expand VPExpandSCEVRecipes in \p Plan's entry block. Each /// VPExpandSCEVRecipe is replaced with a live-in wrapping the expanded IR /// value. 
A mapping from SCEV expressions to their expanded IR value is diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 7921a6fa0411e..ecb6c95137fdc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -46,7 +46,8 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { if (U && !isa(U->getValue())) return Plan.getOrAddLiveIn(U->getValue()); auto *Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded); + VPBasicBlock *EntryVPBB = Plan.getEntry(); + EntryVPBB->insert(Expanded, EntryVPBB->getFirstNonPhi()); return Expanded; } @@ -79,8 +80,15 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { return true; } - return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) && - B == Plan.getBackedgeTakenCount(); + if (match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A)) + return B == Plan.getBackedgeTakenCount(); + + if (match(V, + m_c_BinaryAnd(m_VPValue(), + m_VPInstruction()))) + return true; + + return false; } /// Returns true if \p R propagates poison from any operand to its result. @@ -568,6 +576,13 @@ vputils::getRecipesForUncountableExit(VPlan &Plan, } VPSingleDefRecipe *vputils::findHeaderMask(VPlan &Plan) { + if (VPValue *AliasMask = findIncomingAliasMask(Plan)) { + assert(match(AliasMask->getSingleUser(), + m_c_BinaryAnd(m_VPValue(), m_Specific(AliasMask))) && + "AliasMask must be only be used with the original header mask"); + return cast(AliasMask->getSingleUser()); + } + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); SmallVector WideCanonicalIVs; auto *FoundWidenCanonicalIVUser = find_if( @@ -606,9 +621,25 @@ VPSingleDefRecipe *vputils::findHeaderMask(VPlan &Plan) { HeaderMask = VPI; } } + + for (VPRecipeBase &R : LoopRegion->getEntryBasicBlock()->phis()) { + auto *Def = cast(&R); + if (vputils::isHeaderMask(Def, Plan)) { + assert(!HeaderMask && "Multiple header masks found?"); + HeaderMask = Def; + } + } + return HeaderMask; } +VPValue *vputils::findIncomingAliasMask(const VPlan &Plan) { + for (VPRecipeBase &R : *const_cast(Plan).getVectorPreheader()) + if (match(&R, m_VPInstruction())) + return cast(&R); + return nullptr; +} + bool VPBlockUtils::isHeader(const VPBlockBase *VPB, const VPDominatorTree &VPDT) { auto *VPBB = dyn_cast(VPB); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index c4cacebcd78ba..040a886c1022e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -147,10 +147,15 @@ VPInstruction *findComputeReductionResult(VPReductionPHIRecipe *PhiR); /// Collect the header mask with the pattern: /// (ICMP_ULE, WideCanonicalIV, backedge-taken-count) +/// Note: If alias masking is enabled this will find: +/// (AND, HeaderMask, AliasMask) /// TODO: Introduce explicit recipe for header-mask instead of searching /// the header-mask pattern manually. VPSingleDefRecipe *findHeaderMask(VPlan &Plan); +/// Finds the incoming alias-mask within the vector preheader. 
+VPValue *findIncomingAliasMask(const VPlan &Plan); + } // namespace vputils //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll new file mode 100644 index 0000000000000..065bb58f227e1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias-mask.ll @@ -0,0 +1,454 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize %s | FileCheck %s --check-prefix=CHECK-TF + +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-TF-LABEL: define void @alias_mask( +; CHECK-TF-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-TF-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK-TF: [[FOR_BODY_PREHEADER]]: +; CHECK-TF-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP0:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-TF-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-TF-NEXT: [[TMP3:%.*]] = zext [[ALIAS_MASK]] to +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP3]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-TF-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-TF-NEXT: [[TMP5:%.*]] = sub i64 -1, [[N]] +; CHECK-TF-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP5]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP6:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-TF-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP10:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_MASK]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = select [[TMP10]], [[WIDE_MASKED_LOAD]], splat (i8 1) +; CHECK-TF-NEXT: [[TMP14:%.*]] = sdiv [[WIDE_MASKED_LOAD3]], [[TMP13]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-TF-NEXT: call void 
@llvm.masked.store.nxv16i8.p0( [[TMP14]], ptr align 1 [[TMP15]], [[TMP10]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true +; CHECK-TF-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %div = sdiv i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %div, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body, %entry + ret void +} + +; Note: This test could emit a `llvm.loop.dependence.raw` mask to avoid creating +; a dependency between the store and the load, but it is not necessary for +; correctness. +define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-TF-LABEL: define i32 @alias_mask_read_after_write( +; CHECK-TF-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[C2:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-TF-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP19]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK-TF: [[FOR_BODY_PREHEADER]]: +; CHECK-TF-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP0:%.*]] = inttoptr i64 [[C2]] to ptr +; CHECK-TF-NEXT: [[TMP1:%.*]] = inttoptr i64 [[B1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP0]], ptr [[TMP1]], i64 4) +; CHECK-TF-NEXT: [[TMP3:%.*]] = zext [[ALIAS_MASK]] to +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP3]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-TF-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-TF-NEXT: [[TMP5:%.*]] = sub i64 -1, [[N]] +; CHECK-TF-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP5]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP6:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-TF-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ 
[[TMP16:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP10:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_MASK]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP11]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP12]], [[TMP10]]) +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP13]], [[TMP10]], poison) +; CHECK-TF-NEXT: [[TMP14:%.*]] = add [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = add [[TMP14]], [[WIDE_MASKED_LOAD3]] +; CHECK-TF-NEXT: [[TMP16]] = select [[TMP10]], [[TMP15]], [[VEC_PHI]] +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-TF-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; CHECK-TF-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + + +entry: + %cmp19 = icmp sgt i64 %n, 0 + br i1 %cmp19, label %for.body, label %exit + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ] + %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv + %load.a = load i32, ptr %gep.a, align 2 + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %load.a, ptr %gep.c, align 2 + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %load.b = load i32, ptr %gep.b, align 2 + %add = add i32 %load.a, %accum + %add2 = add i32 %add, %load.b + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %entry, %for.body + %result = phi i32 [ 0, %entry ], [ %add2, %for.body ] + ret i32 %result +} + +define void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) { +; CHECK-TF-LABEL: define void @alias_mask_multiple( +; CHECK-TF-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[A3:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-TF-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-TF-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK-TF: [[FOR_BODY_PREHEADER]]: +; CHECK-TF-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP0:%.*]] = inttoptr i64 [[A3]] to ptr +; CHECK-TF-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-TF-NEXT: [[TMP2:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-TF-NEXT: [[TMP3:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-TF-NEXT: [[TMP4:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-TF-NEXT: [[TMP5:%.*]] = call 
@llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP3]], ptr [[TMP4]], i64 1) +; CHECK-TF-NEXT: [[ALIAS_MASK:%.*]] = and [[TMP2]], [[TMP5]] +; CHECK-TF-NEXT: [[TMP7:%.*]] = zext [[ALIAS_MASK]] to +; CHECK-TF-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP7]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-TF-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-TF-NEXT: [[TMP9:%.*]] = sub i64 -1, [[N]] +; CHECK-TF-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP9]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-TF-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP14:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_MASK]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP15]], [[TMP14]], poison) +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP16]], [[TMP14]], poison) +; CHECK-TF-NEXT: [[TMP17:%.*]] = add [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP17]], ptr align 1 [[TMP18]], [[TMP14]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true +; CHECK-TF-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %add = add i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %add, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body, %entry + ret void +} + +; Checks using a scalar outside the loop, with requires extracting the last +; active element. 
+define i8 @alias_masking_exit_value(ptr %ptrA, ptr %ptrB) { +; CHECK-TF-LABEL: define i8 @alias_masking_exit_value( +; CHECK-TF-SAME: ptr [[PTRA:%.*]], ptr [[PTRB:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[PTRA2:%.*]] = ptrtoaddr ptr [[PTRA]] to i64 +; CHECK-TF-NEXT: [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64 +; CHECK-TF-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK-TF: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-TF-NEXT: [[TMP0:%.*]] = inttoptr i64 [[PTRA2]] to ptr +; CHECK-TF-NEXT: [[TMP1:%.*]] = inttoptr i64 [[PTRB1]] to ptr +; CHECK-TF-NEXT: [[ALIAS_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-TF-NEXT: [[TMP3:%.*]] = zext [[ALIAS_MASK]] to +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP3]]) +; CHECK-TF-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-TF-NEXT: [[TMP5:%.*]] = trunc i64 [[NUM_ACTIVE_LANES]] to i32 +; CHECK-TF-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8 +; CHECK-TF-NEXT: [[TMP7:%.*]] = mul i8 1, [[TMP6]] +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[TMP7]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i32 [[TMP5]], 1 +; CHECK-TF-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i32 -1001, [[TMP5]] +; CHECK-TF-NEXT: [[TMP8:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-TF-NEXT: br i1 [[TMP8]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1000) +; CHECK-TF-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv16i8() +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP12]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP13:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_MASK]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i32 [[INDEX]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i32 [[INDEX]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[TMP13]], poison) +; CHECK-TF-NEXT: [[TMP16:%.*]] = add [[VEC_IND]], [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP16]], ptr align 1 [[TMP15]], [[TMP13]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP5]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1000) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-TF-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-TF-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: [[TMP19:%.*]] = xor [[TMP13]], splat (i1 true) +; CHECK-TF-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP19]], i1 false) +; CHECK-TF-NEXT: 
[[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP16]], i64 [[LAST_ACTIVE_LANE]] +; CHECK-TF-NEXT: br [[EXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gepA = getelementptr inbounds i8, ptr %ptrA, i32 %iv + %gepB = getelementptr inbounds i8, ptr %ptrB, i32 %iv + %loadA = load i8, ptr %gepA + %iv.trunc = trunc i32 %iv to i8 + %add = add i8 %iv.trunc, %loadA + store i8 %add, ptr %gepB + %iv.next = add nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + %exit.value = phi i8 [ %add, %loop ] + ret i8 %exit.value +} + +; Unsupported: Reversing the alias mask is not correct. +define void @alias_mask_reverse_iterate(ptr noalias %ptrA, ptr %ptrB, ptr %ptrC, i64 %n) { +; CHECK-TF-LABEL: define void @alias_mask_reverse_iterate( +; CHECK-TF-SAME: ptr noalias [[PTRA:%.*]], ptr [[PTRB:%.*]], ptr [[PTRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[PTRC2:%.*]] = ptrtoaddr ptr [[PTRC]] to i64 +; CHECK-TF-NEXT: [[PTRB1:%.*]] = ptrtoaddr ptr [[PTRB]] to i64 +; CHECK-TF-NEXT: [[IV_START:%.*]] = add i64 [[N]], -1 +; CHECK-TF-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK-TF: [[VECTOR_MEMCHECK]]: +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-TF-NEXT: [[TMP2:%.*]] = sub i64 [[PTRB1]], [[PTRC2]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[IV_START]]) +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: [[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[IV_START]], [[INDEX]] +; CHECK-TF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[PTRA]], i64 [[OFFSET_IDX]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = sub nuw nsw i64 [[TMP4]], 1 +; CHECK-TF-NEXT: [[TMP10:%.*]] = sub i64 0, [[TMP9]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv16i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[REVERSE]], poison) +; CHECK-TF-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_MASKED_LOAD]]) +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[PTRB]], i64 [[OFFSET_IDX]] +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv16i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP13]], [[REVERSE4]], poison) +; CHECK-TF-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv16i8( [[WIDE_MASKED_LOAD5]]) +; CHECK-TF-NEXT: [[TMP14:%.*]] = add [[REVERSE6]], [[REVERSE3]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds 
i8, ptr [[PTRC]], i64 [[OFFSET_IDX]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv16i8( [[TMP14]]) +; CHECK-TF-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv16i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[REVERSE7]], ptr align 1 [[TMP16]], [[REVERSE8]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[IV_START]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-TF-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: br [[EXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; +entry: + %iv.start = add nsw i64 %n, -1 + br label %loop + +loop: + %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds i8, ptr %ptrA, i64 %iv + %loadA = load i8, ptr %gep.A, align 1 + %gep.B = getelementptr inbounds i8, ptr %ptrB, i64 %iv + %loadB = load i8, ptr %gep.B, align 1 + %add = add i8 %loadB, %loadA + %gep.C = getelementptr inbounds i8, ptr %ptrC, i64 %iv + store i8 %add, ptr %gep.C, align 1 + %iv.next = add nsw i64 %iv, -1 + %ec = icmp eq i64 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test taken from: scalable-first-order-recurrence.ll. Check we don't use +; an alias-mask with first-order recurrences, as we cannot handle the +; splice.right with the alias-mask/clamped VF yet. +define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) { +; CHECK-TF-LABEL: define i32 @recurrence_1( +; CHECK-TF-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: [[ENTRY:.*:]] +; CHECK-TF-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-TF-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-TF-NEXT: br label %[[FOR_PREHEADER:.*]] +; CHECK-TF: [[FOR_PREHEADER]]: +; CHECK-TF-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-TF-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-TF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-TF-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK-TF: [[VECTOR_MEMCHECK]]: +; CHECK-TF-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-TF-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-TF-NEXT: [[TMP6:%.*]] = add i64 [[B1]], -4 +; CHECK-TF-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], [[A2]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP5]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-TF: [[VECTOR_PH]]: +; CHECK-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 2 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) +; CHECK-TF-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP14:%.*]] = mul nuw i32 [[TMP13]], 4 +; CHECK-TF-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1 +; CHECK-TF-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 [[PRE_LOAD]], i32 [[TMP15]] +; CHECK-TF-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-TF: 
[[VECTOR_BODY]]: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_MASKED_LOAD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[INDEX]], 1 +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP17]], [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP18:%.*]] = call @llvm.vector.splice.right.nxv4i32( [[VECTOR_RECUR]], [[WIDE_MASKED_LOAD]], i32 1) +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP20:%.*]] = add [[WIDE_MASKED_LOAD]], [[TMP18]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP20]], ptr align 4 [[TMP19]], [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP2]]) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true +; CHECK-TF-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-TF: [[MIDDLE_BLOCK]]: +; CHECK-TF-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK]], splat (i1 true) +; CHECK-TF-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( [[TMP23]], i1 false) +; CHECK-TF-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-TF-NEXT: [[TMP24:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], 1 +; CHECK-TF-NEXT: [[TMP25:%.*]] = extractelement [[WIDE_MASKED_LOAD]], i64 [[TMP24]] +; CHECK-TF-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP27:%.*]] = mul nuw i32 [[TMP26]], 4 +; CHECK-TF-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1 +; CHECK-TF-NEXT: [[TMP29:%.*]] = extractelement [[VECTOR_RECUR]], i32 [[TMP28]] +; CHECK-TF-NEXT: [[TMP30:%.*]] = icmp eq i64 [[LAST_ACTIVE_LANE]], 0 +; CHECK-TF-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP29]], i32 [[TMP25]] +; CHECK-TF-NEXT: br [[FOR_EXIT:label %.*]] +; CHECK-TF: [[SCALAR_PH]]: +; + +entry: + br label %for.preheader + +for.preheader: + %pre_load = load i32, ptr %a + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.next + %1 = load i32, ptr %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, ptr %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret i32 %0 +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/expensive-alias-masking.ll b/llvm/test/Transforms/LoopVectorize/AArch64/expensive-alias-masking.ll new file mode 100644 index 0000000000000..7044e652df8da --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/expensive-alias-masking.ll @@ -0,0 +1,85 @@ +; RUN: opt -S -disable-output 
-mattr=+sve2 -passes=loop-vectorize -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize %s -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize 2>%t
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-ALIAS-MASKING-REMARKS
+; RUN: opt -S -disable-output -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize %s -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize 2>%t
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-DIFF-CHECKS-REMARKS
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; This loop has four store pointers and eight load pointers. It requires 38
+; diff checks (no store alias = (4*3)/2 = 6 checks, no store alias with load =
+; 4 * 8 = 32 checks, for a total of 38 checks). The loop's trip count is low (33).
+;
+; Diff checks are determined to be unprofitable due to the high number of checks
+; and low trip count.
+;
+; With alias-masking we do vectorize this loop, as the cost of setting up the
+; alias-mask is not factored into the vectorization cost.
+;
+; TODO: Cost the alias-mask when using -force-partial-aliasing-vectorization.
+
+; CHECK-DIFF-CHECKS-REMARKS: loop not vectorized
+
+; CHECK-ALIAS-MASKING-REMARKS: vectorized loop (vectorization width: vscale x 4, interleaved count: 1)
+
+define void @expensive_runtime_checks(ptr %0, ptr %1, ptr %2) {
+entry:
+ %5 = load ptr, ptr %1, align 8
+ %6 = load ptr, ptr %2, align 8
+ %7 = load ptr, ptr %0, align 8
+ %8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ %9 = load ptr, ptr %8, align 8
+ %10 = getelementptr inbounds nuw i8, ptr %2, i64 8
+ %11 = load ptr, ptr %10, align 8
+ %12 = getelementptr inbounds nuw i8, ptr %0, i64 8
+ %13 = load ptr, ptr %12, align 8
+ %14 = getelementptr inbounds nuw i8, ptr %1, i64 16
+ %15 = load ptr, ptr %14, align 8
+ %16 = getelementptr inbounds nuw i8, ptr %2, i64 16
+ %17 = load ptr, ptr %16, align 8
+ %18 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %19 = load ptr, ptr %18, align 8
+ %20 = getelementptr inbounds nuw i8, ptr %1, i64 24
+ %21 = load ptr, ptr %20, align 8
+ %22 = getelementptr inbounds nuw i8, ptr %2, i64 24
+ %23 = load ptr, ptr %22, align 8
+ %24 = getelementptr inbounds nuw i8, ptr %0, i64 24
+ %25 = load ptr, ptr %24, align 8
+ br label %loop
+
+loop:
+ %27 = phi i64 [ 0, %entry ], [ %52, %loop ]
+ %28 = getelementptr inbounds nuw [4 x i8], ptr %5, i64 %27
+ %29 = load i32, ptr %28, align 4
+ %30 = getelementptr inbounds nuw [4 x i8], ptr %6, i64 %27
+ %31 = load i32, ptr %30, align 4
+ %32 = add nsw i32 %31, %29
+ %33 = getelementptr inbounds nuw [4 x i8], ptr %7, i64 %27
+ store i32 %32, ptr %33, align 4
+ %34 = getelementptr inbounds nuw [4 x i8], ptr %9, i64 %27
+ %35 = load i32, ptr %34, align 4
+ %36 = getelementptr inbounds nuw [4 x i8], ptr %11, i64 %27
+ %37 = load i32, ptr %36, align 4
+ %38 = add nsw i32 %37, %35
+ %39 = getelementptr inbounds nuw [4 x i8], ptr %13, i64 %27
+ store i32 %38, ptr %39, align 4
+ %40 = getelementptr inbounds nuw [4 x i8], ptr %15, i64 %27
+ %41 = load i32, ptr %40, align 4
+ %42 = getelementptr inbounds nuw [4 x i8], ptr %17, i64 %27
+ %43 = load i32, ptr %42, align 4
+ %44 = add nsw i32 %43, %41
+ %45 = getelementptr inbounds nuw [4 x i8], ptr %19, i64 %27
+ store i32 %44, ptr %45, align 4
+ %46 = getelementptr inbounds nuw [4 x i8], ptr %21, i64 %27
+ %47 = load i32, ptr %46, align 4
+ %48 = getelementptr inbounds nuw [4 x i8], ptr %23, i64 %27
+ %49 = load i32, ptr %48, align 4
+ %50 = add nsw i32 %49, %47
+ %51 =
getelementptr inbounds nuw [4 x i8], ptr %25, i64 %27 + store i32 %50, ptr %51, align 4 + %52 = add nuw nsw i64 %27, 1 + %53 = icmp eq i64 %52, 33 + br i1 %53, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/alias-mask-force-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/alias-mask-force-evl.ll new file mode 100644 index 0000000000000..88f88f5ffdb6c --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/alias-mask-force-evl.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -S -mattr=+v -mtriple riscv64 -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl -passes=loop-vectorize %s | FileCheck %s + +; Note: Alias masks are not supported with EVL at the moment. + +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define void @alias_mask( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[C1]], [[B2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]] +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[CURRENT_ITERATION_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[CURRENT_ITERATION_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[CURRENT_ITERATION_IV]] +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP4]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[CURRENT_ITERATION_IV]] +; CHECK-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP5]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vp.merge.nxv16i8( splat (i1 true), [[VP_OP_LOAD]], splat (i8 1), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[VP_OP_LOAD3]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[CURRENT_ITERATION_IV]] +; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP7]], ptr align 1 [[TMP8]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[CURRENT_ITERATION_NEXT]] = add i64 [[TMP9]], [[CURRENT_ITERATION_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br 
[[EXIT_LOOPEXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %div = sdiv i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %div, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll new file mode 100644 index 0000000000000..eb884af700b83 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=loop-vectorize -mattr=+sve2 -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-dont-vectorize -disable-output -vplan-print-after="printFinalVPlan$" -S %s 2>&1 | FileCheck --check-prefixes=FINAL %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; FINAL-LABEL: VPlan for loop in 'alias_mask' +; FINAL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4,vscale x 8,vscale x 16},UF={1}' { +; FINAL-NEXT: Live-in ir<%n> = original trip-count +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: IR %b2 = ptrtoaddr ptr %b to i64 +; FINAL-NEXT: IR %c1 = ptrtoaddr ptr %c to i64 +; FINAL-NEXT: Successor(s): vector.clamped.vf.check +; FINAL-EMPTY: +; FINAL-NEXT: vector.clamped.vf.check: +; FINAL-NEXT: EMIT-SCALAR vp<[[VP2:%[0-9]+]]> = inttoptr ir<%b2> to ptr +; FINAL-NEXT: EMIT-SCALAR vp<[[VP3:%[0-9]+]]> = inttoptr ir<%c1> to ptr +; FINAL-NEXT: WIDEN-INTRINSIC vp<[[VP4:%[0-9]+]]> = call llvm.loop.dependence.war.mask(vp<[[VP2]]>, vp<[[VP3]]>, ir<1>) +; FINAL-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = num-active-lanes vp<[[VP4]]> +; FINAL-NEXT: EMIT vp<%vf.is.scalar> = icmp ule vp<[[VP5]]>, ir<1> +; FINAL-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = sub ir<-1>, ir<%n> +; FINAL-NEXT: EMIT vp<%vf.step.overflow> = icmp ult vp<[[VP6]]>, vp<[[VP5]]> +; FINAL-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = or vp<%vf.is.scalar>, vp<%vf.step.overflow> +; FINAL-NEXT: EMIT branch-on-cond vp<[[VP7]]> +; FINAL-NEXT: Successor(s): ir-bb, vector.ph +; FINAL-EMPTY: +; FINAL-NEXT: vector.ph: +; FINAL-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask ir<0>, ir<%n>, ir<1> +; FINAL-NEXT: Successor(s): vector.body +; FINAL-EMPTY: +; FINAL-NEXT: vector.body: +; FINAL-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; FINAL-NEXT: ACTIVE-LANE-MASK-PHI vp<[[VP9:%[0-9]+]]> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next> +; FINAL-NEXT: EMIT vp<[[VP10:%[0-9]+]]> = and vp<[[VP9]]>, vp<[[VP4]]> +; FINAL-NEXT: CLONE ir<%ptr.a> = getelementptr inbounds ir<%a>, vp<%index> +; FINAL-NEXT: WIDEN ir<%ld.a> = load ir<%ptr.a>, vp<[[VP10]]> +; FINAL-NEXT: CLONE ir<%ptr.b> = getelementptr inbounds ir<%b>, vp<%index> +; FINAL-NEXT: WIDEN ir<%ld.b> = load ir<%ptr.b>, vp<[[VP10]]> +; FINAL-NEXT: WIDEN ir<%add> = add 
ir<%ld.b>, ir<%ld.a> +; FINAL-NEXT: CLONE ir<%ptr.c> = getelementptr inbounds ir<%c>, vp<%index> +; FINAL-NEXT: WIDEN store ir<%ptr.c>, ir<%add>, vp<[[VP10]]> +; FINAL-NEXT: EMIT vp<%index.next> = add vp<%index>, vp<[[VP5]]> +; FINAL-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%index.next>, ir<%n>, ir<1> +; FINAL-NEXT: EMIT vp<[[VP11:%[0-9]+]]> = not vp<%active.lane.mask.next> +; FINAL-NEXT: EMIT branch-on-cond vp<[[VP11]]> +; FINAL-NEXT: Successor(s): middle.block, vector.body +; FINAL-EMPTY: +; FINAL-NEXT: middle.block: +; FINAL-NEXT: Successor(s): ir-bb +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: No successors +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: Successor(s): ir-bb +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: IR %iv = phi i64 [ 0, %scalar.ph ], [ %iv.next, %for.body ] (extra operand: ir<0> from ir-bb) +; FINAL-NEXT: IR %ptr.a = getelementptr inbounds i8, ptr %a, i64 %iv +; FINAL-NEXT: IR %ld.a = load i8, ptr %ptr.a, align 1 +; FINAL-NEXT: IR %ptr.b = getelementptr inbounds i8, ptr %b, i64 %iv +; FINAL-NEXT: IR %ld.b = load i8, ptr %ptr.b, align 1 +; FINAL-NEXT: IR %add = add i8 %ld.b, %ld.a +; FINAL-NEXT: IR %ptr.c = getelementptr inbounds i8, ptr %c, i64 %iv +; FINAL-NEXT: IR store i8 %add, ptr %ptr.c, align 1 +; FINAL-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1 +; FINAL-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, %n +; FINAL-NEXT: No successors +; FINAL-NEXT: } +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ptr.a = getelementptr inbounds i8, ptr %a, i64 %iv + %ld.a = load i8, ptr %ptr.a, align 1 + %ptr.b = getelementptr inbounds i8, ptr %b, i64 %iv + %ld.b = load i8, ptr %ptr.b, align 1 + %add = add i8 %ld.b, %ld.a + %ptr.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %add, ptr %ptr.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll new file mode 100644 index 0000000000000..3724112123f95 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-partial-aliasing-vectorization -force-target-supports-masked-memory-ops -prefer-predicate-over-epilogue=predicate-dont-vectorize -disable-output -vplan-print-after="attachAliasMaskToHeaderMask$" -S %s 2>&1 | FileCheck --check-prefix=INITIAL %s +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-partial-aliasing-vectorization -force-target-supports-masked-memory-ops -prefer-predicate-over-epilogue=predicate-dont-vectorize -disable-output -vplan-print-after="printFinalVPlan$" -S %s 2>&1 | FileCheck --check-prefix=FINAL %s + +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; INITIAL-LABEL: VPlan for loop in 'alias_mask' +; INITIAL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; INITIAL-NEXT: Live-in vp<[[VP0:%[0-9]+]]> = VF +; INITIAL-NEXT: Live-in vp<[[VP1:%[0-9]+]]> = VF * UF +; INITIAL-NEXT: Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count +; INITIAL-NEXT: Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count +; INITIAL-NEXT: Live-in ir<%n> = original trip-count +; INITIAL-EMPTY: +; INITIAL-NEXT: ir-bb: +; INITIAL-NEXT: 
Successor(s): scalar.ph, vector.ph +; INITIAL-EMPTY: +; INITIAL-NEXT: vector.ph: +; INITIAL-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = incoming-alias-mask +; INITIAL-NEXT: Successor(s): vector loop +; INITIAL-EMPTY: +; INITIAL-NEXT: vector loop: { +; INITIAL-NEXT: vector.body: +; INITIAL-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; INITIAL-NEXT: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<[[VP0]]> +; INITIAL-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP5]]> +; INITIAL-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = icmp ule vp<[[VP6]]>, vp<[[VP3]]> +; INITIAL-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = and vp<[[VP7]]>, vp<[[VP4]]> +; INITIAL-NEXT: Successor(s): vector.body.split +; INITIAL-EMPTY: +; INITIAL-NEXT: vector.body.split: +; INITIAL-NEXT: CLONE ir<%ptr.a> = getelementptr inbounds ir<%a>, ir<%iv> +; INITIAL-NEXT: vp<[[VP9:%[0-9]+]]> = vector-pointer inbounds ir<%ptr.a> +; INITIAL-NEXT: WIDEN ir<%ld.a> = load vp<[[VP9]]>, vp<[[VP8]]> +; INITIAL-NEXT: CLONE ir<%ptr.b> = getelementptr inbounds ir<%b>, ir<%iv> +; INITIAL-NEXT: vp<[[VP10:%[0-9]+]]> = vector-pointer inbounds ir<%ptr.b> +; INITIAL-NEXT: WIDEN ir<%ld.b> = load vp<[[VP10]]>, vp<[[VP8]]> +; INITIAL-NEXT: WIDEN ir<%add> = add ir<%ld.b>, ir<%ld.a> +; INITIAL-NEXT: CLONE ir<%ptr.c> = getelementptr inbounds ir<%c>, ir<%iv> +; INITIAL-NEXT: vp<[[VP11:%[0-9]+]]> = vector-pointer inbounds ir<%ptr.c> +; INITIAL-NEXT: WIDEN store vp<[[VP11]]>, ir<%add>, vp<[[VP8]]> +; INITIAL-NEXT: CLONE ir<%iv.next> = add nuw nsw ir<%iv>, ir<1> +; INITIAL-NEXT: CLONE ir<%exitcond.not> = icmp eq ir<%iv.next>, ir<%n> +; INITIAL-NEXT: Successor(s): vector.latch +; INITIAL-EMPTY: +; INITIAL-NEXT: vector.latch: +; INITIAL-NEXT: EMIT vp<%index.next> = add vp<[[VP5]]>, vp<[[VP1]]> +; INITIAL-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]> +; INITIAL-NEXT: No successors +; INITIAL-NEXT: } +; INITIAL-NEXT: Successor(s): middle.block +; INITIAL-EMPTY: +; INITIAL-NEXT: middle.block: +; INITIAL-NEXT: EMIT branch-on-cond ir +; INITIAL-NEXT: Successor(s): ir-bb, scalar.ph +; INITIAL-EMPTY: +; INITIAL-NEXT: ir-bb: +; INITIAL-NEXT: No successors +; INITIAL-EMPTY: +; INITIAL-NEXT: scalar.ph: +; INITIAL-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ ir<%n>, middle.block ], [ ir<0>, ir-bb ] +; INITIAL-NEXT: Successor(s): ir-bb +; INITIAL-EMPTY: +; INITIAL-NEXT: ir-bb: +; INITIAL-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; INITIAL-NEXT: IR %ptr.a = getelementptr inbounds i8, ptr %a, i64 %iv +; INITIAL-NEXT: IR %ld.a = load i8, ptr %ptr.a, align 1 +; INITIAL-NEXT: IR %ptr.b = getelementptr inbounds i8, ptr %b, i64 %iv +; INITIAL-NEXT: IR %ld.b = load i8, ptr %ptr.b, align 1 +; INITIAL-NEXT: IR %add = add i8 %ld.b, %ld.a +; INITIAL-NEXT: IR %ptr.c = getelementptr inbounds i8, ptr %c, i64 %iv +; INITIAL-NEXT: IR store i8 %add, ptr %ptr.c, align 1 +; INITIAL-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1 +; INITIAL-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, %n +; INITIAL-NEXT: No successors +; INITIAL-NEXT: } +; +; FINAL-LABEL: VPlan for loop in 'alias_mask' +; FINAL: VPlan 'Final VPlan for VF={4},UF={1}' { +; FINAL-NEXT: Live-in ir<%n> = original trip-count +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: IR %b2 = ptrtoaddr ptr %b to i64 +; FINAL-NEXT: IR %c1 = ptrtoaddr ptr %c to i64 +; FINAL-NEXT: Successor(s): vector.clamped.vf.check +; FINAL-EMPTY: +; FINAL-NEXT: vector.clamped.vf.check: +; FINAL-NEXT: EMIT-SCALAR vp<[[VP2:%[0-9]+]]> = inttoptr ir<%b2> to ptr 
+; FINAL-NEXT: EMIT-SCALAR vp<[[VP3:%[0-9]+]]> = inttoptr ir<%c1> to ptr +; FINAL-NEXT: WIDEN-INTRINSIC vp<[[VP4:%[0-9]+]]> = call llvm.loop.dependence.war.mask(vp<[[VP2]]>, vp<[[VP3]]>, ir<1>) +; FINAL-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = num-active-lanes vp<[[VP4]]> +; FINAL-NEXT: EMIT vp<%vf.is.scalar> = icmp ule vp<[[VP5]]>, ir<1> +; FINAL-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = sub ir<-1>, ir<%n> +; FINAL-NEXT: EMIT vp<%vf.step.overflow> = icmp ult vp<[[VP6]]>, vp<[[VP5]]> +; FINAL-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = or vp<%vf.is.scalar>, vp<%vf.step.overflow> +; FINAL-NEXT: EMIT branch-on-cond vp<[[VP7]]> +; FINAL-NEXT: Successor(s): ir-bb, vector.ph +; FINAL-EMPTY: +; FINAL-NEXT: vector.ph: +; FINAL-NEXT: EMIT vp<%trip.count.minus.1> = sub ir<%n>, ir<1> +; FINAL-NEXT: EMIT vp<[[VP9:%[0-9]+]]> = sub vp<[[VP5]]>, ir<1> +; FINAL-NEXT: EMIT vp<%n.rnd.up> = add ir<%n>, vp<[[VP9]]> +; FINAL-NEXT: EMIT vp<%n.mod.vf> = urem vp<%n.rnd.up>, vp<[[VP5]]> +; FINAL-NEXT: EMIT vp<%n.vec> = sub vp<%n.rnd.up>, vp<%n.mod.vf> +; FINAL-NEXT: EMIT vp<[[VP10:%[0-9]+]]> = broadcast vp<%trip.count.minus.1> +; FINAL-NEXT: Successor(s): vector.body +; FINAL-EMPTY: +; FINAL-NEXT: vector.body: +; FINAL-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; FINAL-NEXT: EMIT vp<[[VP11:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<%index> +; FINAL-NEXT: EMIT vp<[[VP12:%[0-9]+]]> = icmp ule vp<[[VP11]]>, vp<[[VP10]]> +; FINAL-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = and vp<[[VP12]]>, vp<[[VP4]]> +; FINAL-NEXT: CLONE ir<%ptr.a> = getelementptr inbounds ir<%a>, vp<%index> +; FINAL-NEXT: WIDEN ir<%ld.a> = load ir<%ptr.a>, vp<[[VP13]]> +; FINAL-NEXT: CLONE ir<%ptr.b> = getelementptr inbounds ir<%b>, vp<%index> +; FINAL-NEXT: WIDEN ir<%ld.b> = load ir<%ptr.b>, vp<[[VP13]]> +; FINAL-NEXT: WIDEN ir<%add> = add ir<%ld.b>, ir<%ld.a> +; FINAL-NEXT: CLONE ir<%ptr.c> = getelementptr inbounds ir<%c>, vp<%index> +; FINAL-NEXT: WIDEN store ir<%ptr.c>, ir<%add>, vp<[[VP13]]> +; FINAL-NEXT: EMIT vp<%index.next> = add vp<%index>, vp<[[VP5]]> +; FINAL-NEXT: EMIT vp<[[VP14:%[0-9]+]]> = icmp eq vp<%index.next>, vp<%n.vec> +; FINAL-NEXT: EMIT branch-on-cond vp<[[VP14]]> +; FINAL-NEXT: Successor(s): middle.block, vector.body +; FINAL-EMPTY: +; FINAL-NEXT: middle.block: +; FINAL-NEXT: Successor(s): ir-bb +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: No successors +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: Successor(s): ir-bb +; FINAL-EMPTY: +; FINAL-NEXT: ir-bb: +; FINAL-NEXT: IR %iv = phi i64 [ 0, %scalar.ph ], [ %iv.next, %for.body ] (extra operand: ir<0> from ir-bb) +; FINAL-NEXT: IR %ptr.a = getelementptr inbounds i8, ptr %a, i64 %iv +; FINAL-NEXT: IR %ld.a = load i8, ptr %ptr.a, align 1 +; FINAL-NEXT: IR %ptr.b = getelementptr inbounds i8, ptr %b, i64 %iv +; FINAL-NEXT: IR %ld.b = load i8, ptr %ptr.b, align 1 +; FINAL-NEXT: IR %add = add i8 %ld.b, %ld.a +; FINAL-NEXT: IR %ptr.c = getelementptr inbounds i8, ptr %c, i64 %iv +; FINAL-NEXT: IR store i8 %add, ptr %ptr.c, align 1 +; FINAL-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1 +; FINAL-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, %n +; FINAL-NEXT: No successors +; FINAL-NEXT: } +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ptr.a = getelementptr inbounds i8, ptr %a, i64 %iv + %ld.a = load i8, ptr %ptr.a, align 1 + %ptr.b = getelementptr inbounds i8, ptr %b, i64 %iv + %ld.b = load i8, ptr %ptr.b, align 1 + %add = add i8 %ld.b, %ld.a + %ptr.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 
%add, ptr %ptr.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll index 22f8f23b26ef9..f1bdc7b0e016d 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll @@ -478,12 +478,12 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count ; CHECK-NEXT: vp<[[VP3:%[0-9]+]]> = original trip-count ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060 -; CHECK-NEXT: IR %inc = add i64 %div, 1 -; CHECK-NEXT: EMIT vp<[[VP3]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) -; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) -; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[VP4:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) +; CHECK-NEXT: EMIT vp<[[VP3]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) +; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060 +; CHECK-NEXT: IR %inc = add i64 %div, 1 +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: vp<[[VP5:%[0-9]+]]> = DERIVED-IV ir<0> + vp<[[VP2]]> * vp<[[VP4]]> diff --git a/llvm/test/Transforms/LoopVectorize/alias-mask-data-tail-folding-style.ll b/llvm/test/Transforms/LoopVectorize/alias-mask-data-tail-folding-style.ll new file mode 100644 index 0000000000000..398f46a5d58cf --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/alias-mask-data-tail-folding-style.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -S -passes=loop-vectorize -force-target-supports-masked-memory-ops -force-partial-aliasing-vectorization -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data -force-vector-width=4 %s | FileCheck %s + +define void @test(ptr %src, ptr %dst, i32 %n) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoaddr ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoaddr ptr [[DST]] to i64 +; CHECK-NEXT: [[UMAX3:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1) +; CHECK-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1) +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH:.*]], label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[SRC2]] to ptr +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[DST1]] to ptr +; CHECK-NEXT: [[ALIAS_MASK:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP2]], ptr [[TMP3]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i1> [[ALIAS_MASK]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 
[[NUM_ACTIVE_LANES]] to i32 +; CHECK-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = sub i32 -1, [[UMAX3]] +; CHECK-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i32 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[UMAX3]], [[TMP10]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP7]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX1]], i32 [[UMAX3]]) +; CHECK-NEXT: [[MASK:%.*]] = and <4 x i1> [[ACTIVE_LANE_MASK]], [[ALIAS_MASK]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[INDEX1]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP12]], <4 x i1> [[MASK]], <4 x i32> poison) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], i32 [[INDEX1]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_LOAD]], ptr align 4 [[TMP13]], <4 x i1> [[MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX1]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_CLAMPED_VF_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i32 %iv + %val = load i32, ptr %gep.src, align 4 + %gep.dst = getelementptr i32, ptr %dst, i32 %iv + store i32 %val, ptr %gep.dst, align 4 + %cond = icmp ult i32 %iv.next, %n + br i1 %cond, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/alias-mask-negative-tests.ll b/llvm/test/Transforms/LoopVectorize/alias-mask-negative-tests.ll new file mode 100644 index 0000000000000..2726b32911432 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/alias-mask-negative-tests.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -S -force-partial-aliasing-vectorization -force-target-supports-masked-memory-ops 
-prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-width=4 -passes=loop-vectorize %s | FileCheck %s + +; Note: First order recurrences are not supported with alias-masking. +define i32 @first_order_recurrence(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) { +; CHECK-LABEL: define i32 @first_order_recurrence( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: br label %[[FOR_PREHEADER:.*]] +; CHECK: [[FOR_PREHEADER]]: +; CHECK-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[B1]], -4 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD]], i32 3 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_MASKED_LOAD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP7]], <4 x i1> [[TMP5]], <4 x i32> poison) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[WIDE_MASKED_LOAD]], [[TMP8]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP10]], ptr align 4 [[TMP9]], <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: 
[[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) +; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[WIDE_MASKED_LOAD]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[VECTOR_RECUR]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[LAST_ACTIVE_LANE]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP15]], i32 [[TMP14]] +; CHECK-NEXT: br [[FOR_EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %for.preheader + +for.preheader: + %pre_load = load i32, ptr %a + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.next + %1 = load i32, ptr %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, ptr %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret i32 %0 +} + +; This loop uses bounds checks (not diff checks), so can't use an alias mask. +define void @uses_bounds_checks(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define void @uses_bounds_checks( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[N]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT3]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP2]], <4 x i1> [[TMP1]], <4 x i8> poison) 
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> poison), !alias.scope [[META4:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[WIDE_MASKED_LOAD4]] to <4 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP5]], ptr align 1 [[TMP6]], <4 x i1> [[TMP1]]), !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %ptr.a = getelementptr inbounds nuw i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %ptr.a, align 1
+ %ptr.b = getelementptr inbounds nuw i32, ptr %b, i64 %iv
+ %load.b = load i32, ptr %ptr.b, align 4
+ %b.trunc = trunc i32 %load.b to i8
+ %add = add i8 %b.trunc, %load.a
+ %ptr.c = getelementptr inbounds nuw i8, ptr %c, i64 %iv
+ store i8 %add, ptr %ptr.c, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+} diff --git a/llvm/test/Transforms/LoopVectorize/alias-mask.ll b/llvm/test/Transforms/LoopVectorize/alias-mask.ll
new file mode 100644
index 0000000000000..6181bf962660a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/alias-mask.ll
@@ -0,0 +1,494 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
+; RUN: opt -S -force-partial-aliasing-vectorization -force-target-supports-masked-memory-ops -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-width=4 -passes=loop-vectorize %s | FileCheck %s
+; RUN: opt -S -force-partial-aliasing-vectorization -force-target-supports-masked-memory-ops -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=2 -force-vector-width=4 -passes=loop-vectorize %s | FileCheck %s
+; RUN: opt -S -force-partial-aliasing-vectorization -force-target-supports-masked-memory-ops -prefer-predicate-over-epilogue=predicate-dont-vectorize -epilogue-vectorization-force-VF=2 -force-vector-interleave=2 -force-vector-width=4 -passes=loop-vectorize %s | FileCheck %s
+
+; Note: -force-vector-interleave and -epilogue-vectorization-force-VF do not
+; change the results, as alias-masking is not supported with interleaving or
+; epilogue vectorization.
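+; (With the alias mask the vector loop below steps by the runtime number of
+; active lanes rather than a compile-time VF, so a forced interleave count or
+; epilogue VF presumably has nothing to apply to.)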
+ +define void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define void @alias_mask( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP5]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP10]], <4 x i1> [[TMP9]], <4 x i8> poison) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP11]], <4 x i1> [[TMP9]], <4 x i8> poison) +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP9]], <4 x i8> [[WIDE_MASKED_LOAD]], <4 x i8> splat (i8 1) +; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i8> [[WIDE_MASKED_LOAD5]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP13]], ptr align 1 [[TMP14]], <4 x i1> [[TMP9]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; 
CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %div = sdiv i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %div, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; Alias mask created via combining multiple dependence masks. +define void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define void @alias_mask_multiple( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A3:%.*]] = ptrtoaddr ptr [[A]] to i64 +; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[A3]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP3]], ptr [[TMP4]], i64 1) +; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i1> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i1> [[TMP6]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP9]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = 
insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT4]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT5]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP12]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP14]], <4 x i1> [[TMP13]], <4 x i8> poison) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP15]], <4 x i1> [[TMP13]], <4 x i8> poison) +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i8> [[WIDE_MASKED_LOAD6]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP16]], ptr align 1 [[TMP17]], <4 x i1> [[TMP13]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %add = add i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %add, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; Alias masking + a simple add reduction. 
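+; Note: the checks below show the middle block selecting between the updated
+; partial sums and the reduction phi with the combined (lane mask & alias mask)
+; predicate before the final reduce, so lanes disabled by the alias mask do not
+; contribute to the reduced value.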
+define i32 @alias_mask_with_reduction(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define i32 @alias_mask_with_reduction( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP5]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP10]], <4 x i1> [[TMP9]], <4 x i8> poison) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP11]], <4 x i1> [[TMP9]], <4 x i8> poison) +; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP12]], ptr align 1 [[TMP13]], <4 x i1> [[TMP9]]) +; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> +; CHECK-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI]], [[TMP14]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 
[[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP15]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %reduce = phi i32 [ 0, %entry ], [ %reduce.next, %for.body ] + %ptr.a = getelementptr inbounds nuw i8, ptr %a, i64 %iv + %ld.a = load i8, ptr %ptr.a, align 1 + %ptr.b = getelementptr inbounds nuw i8, ptr %b, i64 %iv + %ld.b = load i8, ptr %ptr.b, align 1 + %add = add i8 %ld.b, %ld.a + %ptr.c = getelementptr inbounds nuw i8, ptr %c, i64 %iv + store i8 %add, ptr %ptr.c, align 1 + %ext.add = zext i8 %add to i32 + %reduce.next = add nuw nsw i32 %reduce, %ext.add + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret i32 %reduce.next +} + +define void @alias_mask_non_default_address_space(ptr addrspace(1) noalias %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i64 %n) { +; CHECK-LABEL: define void @alias_mask_non_default_address_space( +; CHECK-SAME: ptr addrspace(1) noalias [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr addrspace(1) [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr addrspace(1) [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP5]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 1 [[TMP10]], <4 x i1> [[TMP9]], <4 x i8> poison) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p1(ptr addrspace(1) align 1 [[TMP11]], <4 x i1> [[TMP9]], <4 x i8> poison) +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP9]], <4 x i8> [[WIDE_MASKED_LOAD]], <4 x i8> splat (i8 1) +; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i8> [[WIDE_MASKED_LOAD5]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.v4i8.p1(<4 x i8> [[TMP13]], ptr addrspace(1) align 1 [[TMP14]], <4 x i1> [[TMP9]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT_LOOPEXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %iv + %load.a = load i8, ptr addrspace(1) %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr addrspace(1) %b, i64 %iv + %load.b = load i8, ptr addrspace(1) %gep.b, align 1 + %div = sdiv i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr addrspace(1) %c, i64 %iv + store i8 %div, ptr addrspace(1) %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; Test alias mask with a known trip-count that would be one iteration of the full VF. 
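+; Note: even with the constant trip count of 7 below, the rounded-up vector
+; trip count is still computed modulo the runtime active-lane count, since the
+; clamped VF is only known once the alias mask has been evaluated.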
+define void @alias_mask_known_trip_count(ptr noalias %a, ptr %b, ptr %c) { +; CHECK-LABEL: define void @alias_mask_known_trip_count( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoaddr ptr [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 +; CHECK-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[C1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP0]], ptr [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 -8, [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 7, [[TMP6]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 6) +; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i1> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP9]], <4 x i1> [[TMP8]], <4 x i8> poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i8> poison) +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i8> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP11]], ptr align 1 [[TMP12]], <4 x i1> [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ptr.a = getelementptr inbounds nuw i8, ptr %a, i64 %iv + %load.a = load i8, ptr %ptr.a, align 1 + %ptr.b = getelementptr inbounds nuw i8, ptr %b, i64 %iv + %load.b = load i8, ptr %ptr.b, align 1 + %add = add i8 %load.b, %load.a + %ptr.c = getelementptr 
inbounds nuw i8, ptr %c, i64 %iv + store i8 %add, ptr %ptr.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 7 + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !2 + +exit: + ret void +} + +; Tests `%div = sdiv i64 %iv, 64` (and its use for a load) is not considered +; to be uniform with partial alias masking. +define void @vf_dependent_uniform(ptr noalias %p, ptr %p.out, ptr %p.in, i64 %n) { +; CHECK-LABEL: define void @vf_dependent_uniform( +; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P_OUT:%.*]], ptr [[P_IN:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P_IN2:%.*]] = ptrtoaddr ptr [[P_IN]] to i64 +; CHECK-NEXT: [[P_OUT1:%.*]] = ptrtoaddr ptr [[P_OUT]] to i64 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: br label %[[VECTOR_CLAMPED_VF_CHECK:.*]] +; CHECK: [[VECTOR_CLAMPED_VF_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[P_IN2]] to ptr +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[P_OUT1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP0]], ptr [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[NUM_ACTIVE_LANES:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[NUM_ACTIVE_LANES]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VF_IS_SCALAR:%.*]] = icmp ule i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[SMAX]] +; CHECK-NEXT: [[VF_STEP_OVERFLOW:%.*]] = icmp ult i64 [[TMP5]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[VF_IS_SCALAR]], [[VF_STEP_OVERFLOW]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[SMAX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[NUM_ACTIVE_LANES]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[SMAX]], [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE10:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE10]] ] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = sdiv <4 x i64> [[VEC_IND]], splat (i64 64) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: 
[[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP15]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP9]], i32 1 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP20]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i64> [ [[TMP16]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = load i64, ptr [[TMP25]], align 8 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP26]], i32 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i64> [ [[TMP22]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3 +; CHECK-NEXT: br i1 [[TMP29]], label %[[PRED_LOAD_IF9:.*]], label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_IF9]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP31]], align 8 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i64> [[TMP28]], i64 [[TMP32]], i32 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_CONTINUE10]]: +; CHECK-NEXT: [[TMP34:%.*]] = phi <4 x i64> [ [[TMP28]], %[[PRED_LOAD_CONTINUE8]] ], [ [[TMP33]], %[[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[P_IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 [[TMP35]], <4 x i1> [[TMP9]], <4 x i64> poison) +; CHECK-NEXT: [[TMP36:%.*]] = add <4 x i64> [[TMP34]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP36]], ptr align 8 [[TMP37]], <4 x i1> [[TMP9]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[NUM_ACTIVE_LANES]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %div = sdiv i64 %iv, 64 + %gep = getelementptr inbounds i64, ptr %p, i64 %div + %ld = load i64, ptr %gep, align 8 + %gep2 = getelementptr inbounds i64, ptr 
%p.in, i64 %iv + %ld2 = load i64, ptr %gep2, align 8 + %val = add i64 %ld, %ld2 + %store.gep = getelementptr i64, ptr %p.out, i64 %iv + store i64 %val, ptr %store.gep, align 8 + %iv.next = add nsw i64 %iv, 1 + %exitcond = icmp slt i64 %iv.next, %n + br i1 %exitcond, label %loop, label %exit +exit: + ret void +} + +!2 = distinct !{!2, !3} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index 736d8356d2219..1191bf5e018d7 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -436,8 +436,8 @@ define i64 @ivopt_widen_ptr_indvar_1(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_1( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: @@ -521,8 +521,8 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_2( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: @@ -626,8 +626,8 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_3( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: @@ -708,10 +708,10 @@ define void @strided_ptr_iv_runtime_stride(ptr %pIn, ptr %pOut, i32 %nCols, i32 ; STRIDED-NEXT: entry: ; STRIDED-NEXT: [[PIN2:%.*]] = ptrtoaddr ptr [[PIN:%.*]] to i64 ; STRIDED-NEXT: [[POUT1:%.*]] = ptrtoaddr ptr [[POUT:%.*]] to i64 -; STRIDED-NEXT: [[TMP0:%.*]] = zext i32 [[NCOLS:%.*]] to i64 -; STRIDED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) ; STRIDED-NEXT: [[TMP1:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; STRIDED-NEXT: [[TMP2:%.*]] = shl nsw i64 [[TMP1]], 2 +; STRIDED-NEXT: [[TMP10:%.*]] = zext i32 [[NCOLS:%.*]] to i64 +; STRIDED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP10]], i64 1) ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; STRIDED: vector.scevcheck: diff --git a/llvm/test/Transforms/LoopVectorize/remove-redundant-trip-count-scev.ll b/llvm/test/Transforms/LoopVectorize/remove-redundant-trip-count-scev.ll new file mode 100644 index 0000000000000..7aeb4cf5deb8a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/remove-redundant-trip-count-scev.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize 
-force-target-supports-masked-memory-ops -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + +define void @test(ptr %base_a, ptr %base_b, i32 %ntypes) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[BASE_A:%.*]], ptr [[BASE_B:%.*]], i32 [[NTYPES:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[NTYPES]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: [[N:%.*]] = zext nneg i32 [[NTYPES]] to i64 +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <4 x i64> , [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[N]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i64> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP20]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IND:%.*]] = add <4 x i64> [[BROADCAST_SPLAT6]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x i8], ptr [[BASE_A]], <4 x i64> [[VEC_IND1]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [8 x i8], ptr [[BASE_B]], i64 [[TMP20]] +; CHECK-NEXT: call void @llvm.masked.store.v4p0.p0(<4 x ptr> [[TMP3]], ptr align 8 [[TMP23]], <4 x i1> [[TMP0]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP20]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND1]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp sgt i32 %ntypes, 0 + br i1 %cmp, label %loop.preheader, label %exit + +loop.preheader: + %n = zext nneg i32 %ntypes to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %loop.preheader ], [ 
%iv.next, %loop ] + %offset = phi i64 [ 0, %loop.preheader ], [ %offset.next, %loop ] + %gep_a = getelementptr inbounds [4 x i8], ptr %base_a, i64 %offset + %gep_b = getelementptr inbounds [8 x i8], ptr %base_b, i64 %iv + store ptr %gep_a, ptr %gep_b, align 8 + %offset.next = add nuw nsw i64 %offset, %n + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index 55c73cb0928ff..c97fc36ac76d1 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -205,10 +205,15 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_PREHEADER]]: ; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 2, [[STEP]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP0]] +; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]] ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2 -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]] ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1) @@ -217,11 +222,6 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]] -; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; CHECK: [[VECTOR_SCEVCHECK]]: