From 2532eb7fa763deb6e0909f4737d447c55b9f5a55 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 22 Feb 2025 19:15:32 +0000 Subject: [PATCH 1/3] [VPlan] Move predication to VPlanTransform (NFC) (WIP). This patch moves the logic to predicate and linearize a VPlan to a dedicated VPlan transform. The main logic to perform predication is ready to review, although there are a few things to note that should be improved, either directly in the PR or in the future: * Edge and block masks are cached in VPRecipeBuilder, so they can be accessed during recipe construction. A better alternative may be to add mask operands to all VPInstructions that need them and use those during recipe construction. * The mask caching in a map also means that this map needs updating each time a new recipe replaces a VPInstruction; this would also be handled by adding mask operands. Currently this is still WIP because early-exit loop handling does not work yet: the exit conditions are not available in the initial VPlans. 
This will be fixed with https://github.com/llvm/llvm-project/pull/128419 and follow-ups. All tests except early-exit loops are passing. --- llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../Transforms/Vectorize/LoopVectorize.cpp | 293 ++--------------- .../Transforms/Vectorize/VPRecipeBuilder.h | 63 +--- .../Vectorize/VPlanConstruction.cpp | 16 +- .../Transforms/Vectorize/VPlanPredicator.cpp | 298 ++++++++++++++++++ .../Transforms/Vectorize/VPlanTransforms.h | 14 +- .../Transforms/Vectorize/VPlanTestBase.h | 3 +- 7 files changed, 357 insertions(+), 331 deletions(-) create mode 100644 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 0dc6a7d2f594f..e6c7142edd100 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -24,6 +24,7 @@ add_llvm_component_library(LLVMVectorize VPlan.cpp VPlanAnalysis.cpp VPlanConstruction.cpp + VPlanPredicator.cpp VPlanRecipes.cpp VPlanSLP.cpp VPlanTransforms.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1611c6d3a4437..0925b71d5358c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8216,185 +8216,6 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { }); } -void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { - BasicBlock *Src = SI->getParent(); - assert(!OrigLoop->isLoopExiting(Src) && - all_of(successors(Src), - [this](BasicBlock *Succ) { - return OrigLoop->getHeader() != Succ; - }) && - "unsupported switch either exiting loop or continuing to header"); - // Create masks where the terminator in Src is a switch. We create mask for - // all edges at the same time. This is more efficient, as we can create and - // collect compares for all cases once. 
- VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition()); - BasicBlock *DefaultDst = SI->getDefaultDest(); - MapVector> Dst2Compares; - for (auto &C : SI->cases()) { - BasicBlock *Dst = C.getCaseSuccessor(); - assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); - // Cases whose destination is the same as default are redundant and can be - // ignored - they will get there anyhow. - if (Dst == DefaultDst) - continue; - auto &Compares = Dst2Compares[Dst]; - VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue()); - Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); - } - - // We need to handle 2 separate cases below for all entries in Dst2Compares, - // which excludes destinations matching the default destination. - VPValue *SrcMask = getBlockInMask(Src); - VPValue *DefaultMask = nullptr; - for (const auto &[Dst, Conds] : Dst2Compares) { - // 1. Dst is not the default destination. Dst is reached if any of the cases - // with destination == Dst are taken. Join the conditions for each case - // whose destination == Dst using an OR. - VPValue *Mask = Conds[0]; - for (VPValue *V : ArrayRef(Conds).drop_front()) - Mask = Builder.createOr(Mask, V); - if (SrcMask) - Mask = Builder.createLogicalAnd(SrcMask, Mask); - EdgeMaskCache[{Src, Dst}] = Mask; - - // 2. Create the mask for the default destination, which is reached if none - // of the cases with destination != default destination are taken. Join the - // conditions for each case where the destination is != Dst using an OR and - // negate it. - DefaultMask = DefaultMask ? 
Builder.createOr(DefaultMask, Mask) : Mask; - } - - if (DefaultMask) { - DefaultMask = Builder.createNot(DefaultMask); - if (SrcMask) - DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); - } - EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; -} - -VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { - assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); - - // Look for cached value. - std::pair Edge(Src, Dst); - EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); - if (ECEntryIt != EdgeMaskCache.end()) - return ECEntryIt->second; - - if (auto *SI = dyn_cast(Src->getTerminator())) { - createSwitchEdgeMasks(SI); - assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?"); - return EdgeMaskCache[Edge]; - } - - VPValue *SrcMask = getBlockInMask(Src); - - // The terminator has to be a branch inst! - BranchInst *BI = dyn_cast(Src->getTerminator()); - assert(BI && "Unexpected terminator found"); - if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) - return EdgeMaskCache[Edge] = SrcMask; - - // If source is an exiting block, we know the exit edge is dynamically dead - // in the vector loop, and thus we don't need to restrict the mask. Avoid - // adding uses of an otherwise potentially dead instruction unless we are - // vectorizing a loop with uncountable exits. In that case, we always - // materialize the mask. - if (OrigLoop->isLoopExiting(Src) && - Src != Legal->getUncountableEarlyExitingBlock()) - return EdgeMaskCache[Edge] = SrcMask; - - VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); - assert(EdgeMask && "No Edge Mask found for condition"); - - if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); - - if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. - // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask - // is false and EdgeMask is poison. 
Avoid that by using 'LogicalAnd' - // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. - EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); - } - - return EdgeMaskCache[Edge] = EdgeMask; -} - -VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { - assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); - - // Look for cached value. - std::pair Edge(Src, Dst); - EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); - assert(ECEntryIt != EdgeMaskCache.end() && - "looking up mask for edge which has not been created"); - return ECEntryIt->second; -} - -void VPRecipeBuilder::createHeaderMask() { - BasicBlock *Header = OrigLoop->getHeader(); - - // When not folding the tail, use nullptr to model all-true mask. - if (!CM.foldTailByMasking()) { - BlockMaskCache[Header] = nullptr; - return; - } - - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, NewInsertionPoint); - - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - VPValue *BlockMask = nullptr; - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); - BlockMaskCache[Header] = BlockMask; -} - -VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { - // Return the cached value. 
- BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); - assert(BCEntryIt != BlockMaskCache.end() && - "Trying to access mask for block without one."); - return BCEntryIt->second; -} - -void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { - assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); - assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); - assert(OrigLoop->getHeader() != BB && - "Loop header must have cached block mask"); - - // All-one mask is modelled as no-mask following the convention for masked - // load/store/gather/scatter. Initialize BlockMask to no-mask. - VPValue *BlockMask = nullptr; - // This is the block mask. We OR all unique incoming edges. - for (auto *Predecessor : - SetVector(llvm::from_range, predecessors(BB))) { - VPValue *EdgeMask = createEdgeMask(Predecessor, BB); - if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. - BlockMaskCache[BB] = EdgeMask; - return; - } - - if (!BlockMask) { // BlockMask has its initialized nullptr value. - BlockMask = EdgeMask; - continue; - } - - BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); - } - - BlockMaskCache[BB] = BlockMask; -} - VPWidenMemoryRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range) { @@ -8539,31 +8360,6 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( return nullptr; } -VPBlendRecipe *VPRecipeBuilder::tryToBlend(VPWidenPHIRecipe *PhiR) { - // We know that all PHIs in non-header blocks are converted into selects, so - // we don't have to worry about the insertion order and we can just use the - // builder. At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. 
- - unsigned NumIncoming = PhiR->getNumIncoming(); - SmallVector OperandsWithMask; - for (unsigned In = 0; In < NumIncoming; In++) { - OperandsWithMask.push_back(PhiR->getIncomingValue(In)); - const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); - VPValue *EdgeMask = getEdgeMask(Pred, PhiR->getParent()); - if (!EdgeMask) { - assert(In == 0 && "Both null and non-null edge masks found"); - assert(all_equal(PhiR->operands()) && - "Distinct incoming values with one having a full mask"); - break; - } - OperandsWithMask.push_back(EdgeMask); - } - return new VPBlendRecipe(cast(PhiR->getUnderlyingInstr()), - OperandsWithMask); -} - VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef Operands, VFRange &Range) { @@ -8958,10 +8754,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, if (auto *PhiR = dyn_cast(R)) { VPBasicBlock *Parent = PhiR->getParent(); VPRegionBlock *LoopRegionOf = Parent->getEnclosingLoopRegion(); - // Handle phis in non-header blocks. 
- if (!LoopRegionOf || LoopRegionOf->getEntry() != Parent) - return tryToBlend(PhiR); - + assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && + "Non-header phis should have been handled during predication"); auto *Phi = cast(R->getUnderlyingInstr()); assert(Operands.size() == 2 && "Must have 2 operands for header phis"); if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) @@ -9378,8 +9172,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - DenseMap VPB2IRBB; - auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB); + auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); VPlanTransforms::prepareForVectorization( *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, CM.foldTailByMasking(), OrigLoop, @@ -9412,9 +9205,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, cast(IVInc)->dropPoisonGeneratingFlags(); } - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, VPB2IRBB, LVer); - // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further // process after constructing the initial VPlan. @@ -9442,43 +9232,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, } // --------------------------------------------------------------------------- - // Construct recipes for the instructions in the loop + // Predicate and linearize the top-level loop region. 
// --------------------------------------------------------------------------- + DenseMap BlockMaskCache; + VPlanTransforms::predicateAndLinearize(*Plan, CM.foldTailByMasking(), + BlockMaskCache); - VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); - VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); - BasicBlock *HeaderBB = OrigLoop->getHeader(); - bool NeedsMasks = - CM.foldTailByMasking() || - any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) { - bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); - return Legal->blockNeedsPredication(BB) || NeedsBlends; - }); - + // --------------------------------------------------------------------------- + // Construct recipes for the instructions in the loop + // --------------------------------------------------------------------------- + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder, BlockMaskCache, LVer); RecipeBuilder.collectScaledReductions(Range); - auto *MiddleVPBB = Plan->getMiddleBlock(); - // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. + VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); ReversePostOrderTraversal> RPOT( HeaderVPBB); + auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { - // Create mask based on the IR BB corresponding to VPBB. - // TODO: Predicate directly based on VPlan. - Builder.setInsertPoint(VPBB, VPBB->begin()); - if (VPBB == HeaderVPBB) { - Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi()); - RecipeBuilder.createHeaderMask(); - } else if (NeedsMasks) { - // FIXME: At the moment, masks need to be placed at the beginning of the - // block, as blends introduced for phi nodes need to use it. The created - // blends should be sunk after the mask recipes. 
- RecipeBuilder.createBlockInMask(VPBB); - } - // Convert input VPInstructions to widened recipes. for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *SingleDef = cast(&R); @@ -9488,7 +9264,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // latter are added above for masking. // FIXME: Migrate code relying on the underlying instruction from VPlan0 // to construct recipes below to not use the underlying instruction. - if (isa(&R) || + if (isa( + &R) || (isa(&R) && !UnderlyingValue)) continue; @@ -9497,14 +9274,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, assert((isa(&R) || isa(&R)) && UnderlyingValue && "unsupported recipe"); - if (isa(&R) && - (cast(&R)->getOpcode() == - VPInstruction::BranchOnCond || - (cast(&R)->getOpcode() == Instruction::Switch))) { - R.eraseFromParent(); - break; - } - // TODO: Gradually replace uses of underlying instruction by analyses on // VPlan. Instruction *Instr = cast(UnderlyingValue); @@ -9542,27 +9311,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, } else { Builder.insert(Recipe); } - if (Recipe->getNumDefinedValues() == 1) + if (Recipe->getNumDefinedValues() == 1) { SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue()); - else + // replaceAllUsesWith may invalidate the block mask cache. Update it. + // TODO: Include the masks as operands in the predicated VPlan directly + // to remove the need to keep a map of masks beyond the predication + // transform. + RecipeBuilder.updateBlockMaskCache(SingleDef, + Recipe->getVPSingleValue()); + } else assert(Recipe->getNumDefinedValues() == 0 && "Unexpected multidef recipe"); R.eraseFromParent(); } } - VPBlockBase *PrevVPBB = nullptr; - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { - // Flatten the CFG in the loop. Masks for blocks have already been generated - // and added to recipes as needed. To do so, first disconnect VPBB from its - // successors. 
Then connect VPBB to the previously visited VPBB. - for (auto *Succ : to_vector(VPBB->getSuccessors())) - VPBlockUtils::disconnectBlocks(VPBB, Succ); - if (PrevVPBB) - VPBlockUtils::connectBlocks(PrevVPBB, VPBB); - PrevVPBB = VPBB; - } - assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " @@ -9679,8 +9442,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - DenseMap VPB2IRBB; - auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB); + auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); VPlanTransforms::prepareForVectorization( *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop, getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false, @@ -9700,8 +9462,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { // Collect mapping of IR header phis to header phi recipes, to be used in // addScalarResumePhis. + DenseMap BlockMaskCache; VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, VPB2IRBB, nullptr /*LVer*/); + Builder, BlockMaskCache, nullptr /*LVer*/); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index ae86181487261..f24429328ede0 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -68,15 +68,7 @@ class VPRecipeBuilder { VPBuilder &Builder; - /// When we if-convert we need to create edge masks. We have to cache values - /// so that we don't end up with exponential recursion/IR. 
Note that - /// if-conversion currently takes place during VPlan-construction, so these - /// caches are only used at that stage. - using EdgeMaskCacheTy = - DenseMap, VPValue *>; - using BlockMaskCacheTy = DenseMap; - EdgeMaskCacheTy EdgeMaskCache; - BlockMaskCacheTy BlockMaskCache; + DenseMap &BlockMaskCache; // VPlan construction support: Hold a mapping from ingredients to // their recipe. @@ -90,10 +82,6 @@ class VPRecipeBuilder { /// A mapping of partial reduction exit instructions to their scaling factor. DenseMap ScaledReductionMap; - /// A mapping from VP blocks to IR blocks, used temporarily while migrating - /// away from IR references. - const DenseMap &VPB2IRBB; - /// Loop versioning instance for getting noalias metadata guaranteed by /// runtime checks. LoopVersioning *LVer; @@ -122,11 +110,6 @@ class VPRecipeBuilder { tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef Operands, VFRange &Range); - /// Handle non-loop phi nodes, returning a new VPBlendRecipe. Currently - /// all such phi nodes are turned into a sequence of select instructions as - /// the vectorizer currently performs full if-conversion. - VPBlendRecipe *tryToBlend(VPWidenPHIRecipe *PhiR); - /// Handle call instructions. If \p CI can be widened for \p Range.Start, /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be /// decreased to ensure same decision from \p Range.Start to \p Range.End. 
@@ -164,10 +147,11 @@ class VPRecipeBuilder { LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder, - const DenseMap &VPB2IRBB, + DenseMap &BlockMaskCache, LoopVersioning *LVer) : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), - CM(CM), PSE(PSE), Builder(Builder), VPB2IRBB(VPB2IRBB), LVer(LVer) {} + CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache), + LVer(LVer) {} std::optional getScalingForReduction(const Instruction *ExitInst) { auto It = ScaledReductionMap.find(ExitInst); @@ -196,38 +180,10 @@ class VPRecipeBuilder { Ingredient2Recipe[I] = R; } - /// Create the mask for the vector loop header block. - void createHeaderMask(); - - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True or the loop mask when - /// tail folding. - void createBlockInMask(const VPBasicBlock *VPBB) { - return createBlockInMask(VPB2IRBB.lookup(VPBB)); - } - void createBlockInMask(BasicBlock *BB); - - /// Returns the *entry* mask for the block \p VPBB. - VPValue *getBlockInMask(const VPBasicBlock *VPBB) const { - return getBlockInMask(VPB2IRBB.lookup(VPBB)); - } - /// Returns the *entry* mask for the block \p BB. - VPValue *getBlockInMask(BasicBlock *BB) const; - - /// Create an edge mask for every destination of cases and/or default. - void createSwitchEdgeMasks(SwitchInst *SI); - - /// A helper function that computes the predicate of the edge between SRC - /// and DST. - VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - - /// A helper that returns the previously computed predicate of the edge - /// between SRC and DST. 
- VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { - return getEdgeMask(VPB2IRBB.lookup(Src), VPB2IRBB.lookup(Dst)); + VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); } - VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const; /// Return the recipe created for given ingredient. VPRecipeBase *getRecipe(Instruction *I) { @@ -252,6 +208,13 @@ class VPRecipeBuilder { } return Plan.getOrAddLiveIn(V); } + + void updateBlockMaskCache(VPValue *Old, VPValue *New) { + for (auto &[_, V] : BlockMaskCache) { + if (V == Old) + V = New; + } + } }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 287bc93ce496a..92bd49ace3638 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -66,8 +66,7 @@ class PlainCFGBuilder { : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)) {} /// Build plain CFG for TheLoop and connects it to Plan's entry. - std::unique_ptr - buildPlainCFG(DenseMap &VPB2IRBB); + std::unique_ptr buildPlainCFG(); }; } // anonymous namespace @@ -242,8 +241,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, } // Main interface to build the plain CFG. 
-std::unique_ptr PlainCFGBuilder::buildPlainCFG( - DenseMap &VPB2IRBB) { +std::unique_ptr PlainCFGBuilder::buildPlainCFG() { VPIRBasicBlock *Entry = cast(Plan->getEntry()); BB2VPBB[Entry->getIRBasicBlock()] = Entry; for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks()) @@ -334,18 +332,14 @@ std::unique_ptr PlainCFGBuilder::buildPlainCFG( } } - for (const auto &[IRBB, VPB] : BB2VPBB) - VPB2IRBB[VPB] = IRBB; - LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan); return std::move(Plan); } -std::unique_ptr VPlanTransforms::buildPlainCFG( - Loop *TheLoop, LoopInfo &LI, - DenseMap &VPB2IRBB) { +std::unique_ptr VPlanTransforms::buildPlainCFG(Loop *TheLoop, + LoopInfo &LI) { PlainCFGBuilder Builder(TheLoop, &LI); - return Builder.buildPlainCFG(VPB2IRBB); + return Builder.buildPlainCFG(); } /// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp new file mode 100644 index 0000000000000..3f695b4eb05bb --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -0,0 +1,298 @@ +//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements predication for VPlans. 
+/// +//===----------------------------------------------------------------------===// + +#include "VPRecipeBuilder.h" +#include "VPlan.h" +#include "VPlanCFG.h" +#include "VPlanTransforms.h" +#include "VPlanUtils.h" +#include "llvm/ADT/PostOrderIterator.h" + +using namespace llvm; + +namespace { +struct VPPredicator { + using BlockMaskCacheTy = DenseMap; + VPPredicator(BlockMaskCacheTy &BlockMaskCache) + : BlockMaskCache(BlockMaskCache) {} + + /// Builder to construct recipes to compute masks. + VPBuilder Builder; + + /// When we if-convert we need to create edge masks. We have to cache values + /// so that we don't end up with exponential recursion/IR. + using EdgeMaskCacheTy = + DenseMap, + VPValue *>; + EdgeMaskCacheTy EdgeMaskCache; + + BlockMaskCacheTy &BlockMaskCache; + + /// Returns the previously computed predicate of the edge between \p Src and + /// \p Dst. + VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { + return EdgeMaskCache.lookup({Src, Dst}); + } + + /// Returns the *entry* mask for \p VPBB. + VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); + } + void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { + // TODO: Include the masks as operands in the predicated VPlan directly to + // remove the need to keep a map of masks beyond the predication transform. + assert(!BlockMaskCache.contains(VPBB) && "Mask already set"); + BlockMaskCache[VPBB] = Mask; + } + + /// Compute and return the mask for the vector loop header block. + void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail); + + /// Compute and return the predicate of \p VPBB, assuming that the header + /// block of the loop is set to True or the loop mask when tail folding. + VPValue *createBlockInMask(VPBasicBlock *VPBB); + + /// Computes and return the predicate of the edge between \p Src and \p Dst. 
+ VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); + + /// Create an edge mask for every destination of cases and/or default. + void createSwitchEdgeMasks(VPInstruction *SI); +}; +} // namespace + +VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) { + assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge"); + + // Look for cached value. + VPValue *EdgeMask = getEdgeMask(Src, Dst); + if (EdgeMask) + return EdgeMask; + + VPValue *SrcMask = getBlockInMask(Src); + + // The terminator has to be a branch inst! + if (Src->empty() || Src->getNumSuccessors() == 1) { + EdgeMaskCache[{Src, Dst}] = SrcMask; + return SrcMask; + } + + auto *Term = cast(Src->getTerminator()); + if (Term->getOpcode() == Instruction::Switch) { + createSwitchEdgeMasks(Term); + return getEdgeMask(Src, Dst); + } + + auto *BI = cast(Src->getTerminator()); + assert(BI->getOpcode() == VPInstruction::BranchOnCond); + if (Src->getSuccessors()[0] == Src->getSuccessors()[1]) { + EdgeMaskCache[{Src, Dst}] = SrcMask; + return SrcMask; + } + + EdgeMask = BI->getOperand(0); + assert(EdgeMask && "No Edge Mask found for condition"); + + if (Src->getSuccessors()[0] != Dst) + EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); + + if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. + // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask + // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' + // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. + EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); + } + + EdgeMaskCache[{Src, Dst}] = EdgeMask; + return EdgeMask; +} + +VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) { + Builder.setInsertPoint(VPBB, VPBB->begin()); + // All-one mask is modelled as no-mask following the convention for masked + // load/store/gather/scatter. Initialize BlockMask to no-mask. 
+ VPValue *BlockMask = nullptr; + // This is the block mask. We OR all unique incoming edges. + for (auto *Predecessor : SetVector( + VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) { + VPValue *EdgeMask = createEdgeMask(cast(Predecessor), VPBB); + if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is + // too. + setBlockInMask(VPBB, EdgeMask); + return EdgeMask; + } + + if (!BlockMask) { // BlockMask has its initialized nullptr value. + BlockMask = EdgeMask; + continue; + } + + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); + } + + setBlockInMask(VPBB, BlockMask); + return BlockMask; +} + +void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) { + if (!FoldTail) { + setBlockInMask(HeaderVPBB, nullptr); + return; + } + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. + + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto &Plan = *HeaderVPBB->getPlan(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + HeaderVPBB->insert(IV, NewInsertionPoint); + + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + VPValue *BlockMask = nullptr; + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + setBlockInMask(HeaderVPBB, BlockMask); +} + +void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { + VPBasicBlock *Src = SI->getParent(); + + // Create masks where the terminator in Src is a switch. We create mask for + // all edges at the same time. This is more efficient, as we can create and + // collect compares for all cases once. 
+ VPValue *Cond = SI->getOperand(0); + VPBasicBlock *DefaultDst = cast(Src->getSuccessors()[0]); + MapVector> Dst2Compares; + for (const auto &[Idx, Succ] : + enumerate(ArrayRef(Src->getSuccessors()).drop_front())) { + VPBasicBlock *Dst = cast(Succ); + assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); + // Cases whose destination is the same as default are redundant and can + // be ignored - they will get there anyhow. + if (Dst == DefaultDst) + continue; + auto &Compares = Dst2Compares[Dst]; + VPValue *V = SI->getOperand(Idx + 1); + Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); + } + + // We need to handle 2 separate cases below for all entries in Dst2Compares, + // which excludes destinations matching the default destination. + VPValue *SrcMask = getBlockInMask(Src); + VPValue *DefaultMask = nullptr; + for (const auto &[Dst, Conds] : Dst2Compares) { + // 1. Dst is not the default destination. Dst is reached if any of the + // cases with destination == Dst are taken. Join the conditions for each + // case whose destination == Dst using an OR. + VPValue *Mask = Conds[0]; + for (VPValue *V : ArrayRef(Conds).drop_front()) + Mask = Builder.createOr(Mask, V); + if (SrcMask) + Mask = Builder.createLogicalAnd(SrcMask, Mask); + EdgeMaskCache[{Src, Dst}] = Mask; + + // 2. Create the mask for the default destination, which is reached if + // none of the cases with destination != default destination are taken. + // Join the conditions for each case where the destination is != Dst using + // an OR and negate it. + DefaultMask = DefaultMask ? 
Builder.createOr(DefaultMask, Mask) : Mask; + } + + if (DefaultMask) { + DefaultMask = Builder.createNot(DefaultMask); + if (SrcMask) + DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); + } + EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; +} + +void VPlanTransforms::predicateAndLinearize( + VPlan &Plan, bool FoldTail, + DenseMap &BlockMaskCache) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + // Scan the body of the loop in a topological order to visit each basic block + // after having visited its predecessor basic blocks. + VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); + ReversePostOrderTraversal> RPOT( + Header); + VPPredicator Predicator(BlockMaskCache); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + if (VPBB == Header) { + Predicator.createHeaderMask(Header, FoldTail); + continue; + } + + SmallVector Phis; + for (VPRecipeBase &R : VPBB->phis()) + Phis.push_back(cast(&R)); + + Predicator.createBlockInMask(VPBB); + + for (VPWidenPHIRecipe *Phi : Phis) { + PHINode *IRPhi = cast(Phi->getUnderlyingValue()); + + unsigned NumIncoming = IRPhi->getNumIncomingValues(); + + // We know that all PHIs in non-header blocks are converted into selects, + // so we don't have to worry about the insertion order and we can just use + // the builder. At this point we generate the predication tree. There may + // be duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
+ + SmallVector OperandsWithMask; + for (unsigned In = 0; In < NumIncoming; In++) { + const VPBasicBlock *Pred = Phi->getIncomingBlock(In); + OperandsWithMask.push_back(Phi->getIncomingValue(In)); + VPValue *EdgeMask = Predicator.getEdgeMask(Pred, VPBB); + if (!EdgeMask) { + assert(In == 0 && "Both null and non-null edge masks found"); + assert(all_equal(Phi->operands()) && + "Distinct incoming values with one having a full mask"); + break; + } + OperandsWithMask.push_back(EdgeMask); + } + auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask); + Blend->insertBefore(Phi); + Phi->replaceAllUsesWith(Blend); + Phi->eraseFromParent(); + } + } + + VPBlockBase *PrevVPBB = nullptr; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + // Handle VPBBs down to the latch. + if (PrevVPBB && VPBB == LoopRegion->getExiting()) { + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + break; + } + + auto Successors = to_vector(VPBB->getSuccessors()); + if (Successors.size() > 1) + VPBB->getTerminator()->eraseFromParent(); + + // Flatten the CFG in the loop. Masks for blocks have already been + // generated and added to recipes as needed. To do so, first disconnect + // VPBB from its successors. Then connect VPBB to the previously visited + // VPBB. + for (auto *Succ : Successors) + VPBlockUtils::disconnectBlocks(VPBB, Succ); + if (PrevVPBB) + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + + PrevVPBB = VPBB; + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index d284d916633c8..25a2a03c71d00 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -53,9 +53,7 @@ struct VPlanTransforms { verifyVPlanIsValid(Plan); } - static std::unique_ptr - buildPlainCFG(Loop *TheLoop, LoopInfo &LI, - DenseMap &VPB2IRBB); + static std::unique_ptr buildPlainCFG(Loop *TheLoop, LoopInfo &LI); /// Prepare the plan for vectorization. 
It will introduce a dedicated /// VPBasicBlock for the vector pre-header as well as a VPBasicBlock as exit @@ -217,6 +215,16 @@ struct VPlanTransforms { /// candidates. static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth); + + /// Predicate and linearize the control-flow in the top-level loop region of + /// \p Plan. If \p FoldTail is true, also create a mask guarding the loop + /// header, otherwise use all-true for the header mask. Masks for blocks are + /// added to \p BlockMaskCache, which in turn is temporarily used for wide + /// recipe construction. This argument is temporary and will be removed in the + /// future. + static void + predicateAndLinearize(VPlan &Plan, bool FoldTail, + DenseMap &BlockMaskCache); }; } // namespace llvm diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index 2a15e907e5fa5..e2ad65b93e3dd 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -71,8 +71,7 @@ class VPlanTestIRBase : public testing::Test { Loop *L = LI->getLoopFor(LoopHeader); PredicatedScalarEvolution PSE(*SE, *L); - DenseMap VPB2IRBB; - auto Plan = VPlanTransforms::buildPlainCFG(L, *LI, VPB2IRBB); + auto Plan = VPlanTransforms::buildPlainCFG(L, *LI); VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2)); VPlanTransforms::prepareForVectorization(*Plan, IntegerType::get(*Ctx, 64), PSE, true, false, L, {}, false, R); From 58c8fc4992731e6f56a9af3aab9cef6f8d2f0a2c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 16 May 2025 11:28:41 +0100 Subject: [PATCH 2/3] !fixup address comments, thanks --- .../Transforms/Vectorize/LoopVectorize.cpp | 23 ++-- .../Transforms/Vectorize/VPRecipeBuilder.h | 6 +- .../Transforms/Vectorize/VPlanPredicator.cpp | 119 +++++++++--------- .../Transforms/Vectorize/VPlanTransforms.h | 8 +- 4 files changed, 84 insertions(+), 72 deletions(-) diff --git 
a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 317573790288a..00b5b81cc6c96 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9254,6 +9254,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); + // Mapping from VPValues in the initial plan to their widened VPValues. Needed + // temporarily to update created block masks. + DenseMap Old2New; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { // Convert input VPInstructions to widened recipes. for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { @@ -9313,19 +9316,23 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, } if (Recipe->getNumDefinedValues() == 1) { SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue()); - // replaceAllUsesWith may invalidate the block mask cache. Update it. - // TODO: Include the masks as operands in the predicated VPlan directly - // to remove the need to keep a map of masks beyond the predication - // transform. - RecipeBuilder.updateBlockMaskCache(SingleDef, - Recipe->getVPSingleValue()); - } else + Old2New[SingleDef] = Recipe->getVPSingleValue(); + } else { assert(Recipe->getNumDefinedValues() == 0 && "Unexpected multidef recipe"); - R.eraseFromParent(); + R.eraseFromParent(); + } } } + // replaceAllUsesWith above may invalidate the block masks. Update them here. + // TODO: Include the masks as operands in the predicated VPlan directly + // to remove the need to keep a map of masks beyond the predication + // transform. 
+ RecipeBuilder.updateBlockMaskCache(Old2New); + for (const auto &[Old, New] : Old2New) + Old->getDefiningRecipe()->eraseFromParent(); + assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index f24429328ede0..264b1ea3deb97 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -209,10 +209,12 @@ class VPRecipeBuilder { return Plan.getOrAddLiveIn(V); } - void updateBlockMaskCache(VPValue *Old, VPValue *New) { + void updateBlockMaskCache(const DenseMap &Old2New) { for (auto &[_, V] : BlockMaskCache) { - if (V == Old) + if (auto *New = Old2New.lookup(V)) { + V->replaceAllUsesWith(New); V = New; + } } } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 3f695b4eb05bb..e0e0509353639 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -21,11 +21,8 @@ using namespace llvm; namespace { -struct VPPredicator { +class VPPredicator { using BlockMaskCacheTy = DenseMap; - VPPredicator(BlockMaskCacheTy &BlockMaskCache) - : BlockMaskCache(BlockMaskCache) {} - /// Builder to construct recipes to compute masks. VPBuilder Builder; @@ -38,35 +35,45 @@ struct VPPredicator { BlockMaskCacheTy &BlockMaskCache; - /// Returns the previously computed predicate of the edge between \p Src and - /// \p Dst. - VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { - return EdgeMaskCache.lookup({Src, Dst}); - } + /// Create an edge mask for every destination of cases and/or default. + void createSwitchEdgeMasks(VPInstruction *SI); + + /// Computes and return the predicate of the edge between \p Src and \p Dst. 
+ VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); /// Returns the *entry* mask for \p VPBB. VPValue *getBlockInMask(VPBasicBlock *VPBB) const { return BlockMaskCache.lookup(VPBB); } + void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { // TODO: Include the masks as operands in the predicated VPlan directly to // remove the need to keep a map of masks beyond the predication transform. - assert(!BlockMaskCache.contains(VPBB) && "Mask already set"); + assert(!getBlockInMask(VPBB) && "Mask already set"); BlockMaskCache[VPBB] = Mask; } + VPValue *setEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst, + VPValue *Mask) { + assert(!getEdgeMask(Src, Dst) && "Mask already set"); + return EdgeMaskCache[{Src, Dst}] = Mask; + } + +public: + VPPredicator(BlockMaskCacheTy &BlockMaskCache) + : BlockMaskCache(BlockMaskCache) {} + + /// Returns the precomputed predicate of the edge from \p Src to \p Dst. + VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { + return EdgeMaskCache.lookup({Src, Dst}); + } + /// Compute and return the mask for the vector loop header block. void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail); /// Compute and return the predicate of \p VPBB, assuming that the header /// block of the loop is set to True or the loop mask when tail folding. VPValue *createBlockInMask(VPBasicBlock *VPBB); - - /// Computes and return the predicate of the edge between \p Src and \p Dst. - VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); - - /// Create an edge mask for every destination of cases and/or default. - void createSwitchEdgeMasks(VPInstruction *SI); }; } // namespace @@ -80,11 +87,9 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) { VPValue *SrcMask = getBlockInMask(Src); - // The terminator has to be a branch inst! 
- if (Src->empty() || Src->getNumSuccessors() == 1) { - EdgeMaskCache[{Src, Dst}] = SrcMask; - return SrcMask; - } + // If there's a single successor, there's no terminator recipe. + if (Src->getNumSuccessors() == 1) + return setEdgeMask(Src, Dst, SrcMask); auto *Term = cast(Src->getTerminator()); if (Term->getOpcode() == Instruction::Switch) { @@ -92,28 +97,25 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) { return getEdgeMask(Src, Dst); } - auto *BI = cast(Src->getTerminator()); - assert(BI->getOpcode() == VPInstruction::BranchOnCond); - if (Src->getSuccessors()[0] == Src->getSuccessors()[1]) { - EdgeMaskCache[{Src, Dst}] = SrcMask; - return SrcMask; - } + assert(Term->getOpcode() == VPInstruction::BranchOnCond && + "Unsupported terminator"); + if (Src->getSuccessors()[0] == Src->getSuccessors()[1]) + return setEdgeMask(Src, Dst, SrcMask); - EdgeMask = BI->getOperand(0); + EdgeMask = Term->getOperand(0); assert(EdgeMask && "No Edge Mask found for condition"); if (Src->getSuccessors()[0] != Dst) - EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); + EdgeMask = Builder.createNot(EdgeMask, Term->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. - EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); + EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, Term->getDebugLoc()); } - EdgeMaskCache[{Src, Dst}] = EdgeMask; - return EdgeMask; + return setEdgeMask(Src, Dst, EdgeMask); } VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) { @@ -131,7 +133,7 @@ VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) { return EdgeMask; } - if (!BlockMask) { // BlockMask has its initialized nullptr value. 
+ if (!BlockMask) { // BlockMask has its initial nullptr value. BlockMask = EdgeMask; continue; } @@ -159,11 +161,9 @@ void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) { auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); HeaderVPBB->insert(IV, NewInsertionPoint); - VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - VPValue *BlockMask = nullptr; VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); setBlockInMask(HeaderVPBB, BlockMask); } @@ -179,7 +179,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { for (const auto &[Idx, Succ] : enumerate(ArrayRef(Src->getSuccessors()).drop_front())) { VPBasicBlock *Dst = cast(Succ); - assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); + assert(!getEdgeMask(Src, Dst) && "Edge masks already created"); // Cases whose destination is the same as default are redundant and can // be ignored - they will get there anyhow. if (Dst == DefaultDst) @@ -202,7 +202,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { Mask = Builder.createOr(Mask, V); if (SrcMask) Mask = Builder.createLogicalAnd(SrcMask, Mask); - EdgeMaskCache[{Src, Dst}] = Mask; + setEdgeMask(Src, Dst, Mask); // 2. Create the mask for the default destination, which is reached if // none of the cases with destination != default destination are taken. 
@@ -216,7 +216,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { if (SrcMask) DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); } - EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; + setEdgeMask(Src, DefaultDst, DefaultMask); } void VPlanTransforms::predicateAndLinearize( @@ -229,7 +229,12 @@ void VPlanTransforms::predicateAndLinearize( ReversePostOrderTraversal> RPOT( Header); VPPredicator Predicator(BlockMaskCache); - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + for (VPBlockBase *VPB : RPOT) { + // Only regions with only VPBBs are supported at the moment. + auto *VPBB = cast(VPB); + // Introduce the mask for VPBB, which may introduce needed edge masks, and + // convert all phi recipes of VPBB to blend recipes unless VPBB is the + // header. if (VPBB == Header) { Predicator.createHeaderMask(Header, FoldTail); continue; @@ -241,42 +246,42 @@ void VPlanTransforms::predicateAndLinearize( Predicator.createBlockInMask(VPBB); - for (VPWidenPHIRecipe *Phi : Phis) { - PHINode *IRPhi = cast(Phi->getUnderlyingValue()); - - unsigned NumIncoming = IRPhi->getNumIncomingValues(); - - // We know that all PHIs in non-header blocks are converted into selects, + for (VPWidenPHIRecipe *PhiR : Phis) { + // The non-header Phi is converted into a Blend recipe below, // so we don't have to worry about the insertion order and we can just use // the builder. At this point we generate the predication tree. There may // be duplications since this is a simple recursive scan, but future // optimizations will clean it up. 
SmallVector OperandsWithMask; + unsigned NumIncoming = PhiR->getNumIncoming(); for (unsigned In = 0; In < NumIncoming; In++) { - const VPBasicBlock *Pred = Phi->getIncomingBlock(In); - OperandsWithMask.push_back(Phi->getIncomingValue(In)); + const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); + OperandsWithMask.push_back(PhiR->getIncomingValue(In)); VPValue *EdgeMask = Predicator.getEdgeMask(Pred, VPBB); if (!EdgeMask) { assert(In == 0 && "Both null and non-null edge masks found"); - assert(all_equal(Phi->operands()) && + assert(all_equal(PhiR->operands()) && "Distinct incoming values with one having a full mask"); break; } OperandsWithMask.push_back(EdgeMask); } + PHINode *IRPhi = cast(PhiR->getUnderlyingValue()); auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask); - Blend->insertBefore(Phi); - Phi->replaceAllUsesWith(Blend); - Phi->eraseFromParent(); + Blend->insertBefore(PhiR); + PhiR->replaceAllUsesWith(Blend); + PhiR->eraseFromParent(); } } + // Linearize the blocks of the loop into one serial chain. VPBlockBase *PrevVPBB = nullptr; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { // Handle VPBBs down to the latch. - if (PrevVPBB && VPBB == LoopRegion->getExiting()) { - VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + if (VPBB == LoopRegion->getExiting()) { + if (PrevVPBB) + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); break; } @@ -284,10 +289,8 @@ void VPlanTransforms::predicateAndLinearize( if (Successors.size() > 1) VPBB->getTerminator()->eraseFromParent(); - // Flatten the CFG in the loop. Masks for blocks have already been - // generated and added to recipes as needed. To do so, first disconnect - // VPBB from its successors. Then connect VPBB to the previously visited - // VPBB. + // Flatten the CFG in the loop. To do so, first disconnect VPBB from its + // successors. Then connect VPBB to the previously visited VPBB. 
for (auto *Succ : Successors) VPBlockUtils::disconnectBlocks(VPBB, Succ); if (PrevVPBB) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 66f9120aebe59..62c0be97a3a55 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -223,12 +223,12 @@ struct VPlanTransforms { static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth); - /// Predicate and linearize the control-flow in the top-level loop region of + /// Predicate and linearize the control-flow in the only loop region of /// \p Plan. If \p FoldTail is true, also create a mask guarding the loop /// header, otherwise use all-true for the header mask. Masks for blocks are - /// added to \p BlockMaskCache, which in turn is temporarily used for wide - /// recipe construction. This argument is temporary and will be removed in the - /// future. + /// added to \p BlockMaskCache, which in turn will temporarily be used later + /// for wide recipe construction. This argument is temporary and will be + /// removed in the future. 
static void predicateAndLinearize(VPlan &Plan, bool FoldTail, DenseMap &BlockMaskCache); From 763d667bb138bf81b3e36817ef255b807ff53cb6 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 21 May 2025 13:39:09 +0100 Subject: [PATCH 3/3] !fixup address latest comments, thanks --- .../Transforms/Vectorize/LoopVectorize.cpp | 10 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 8 +- .../Vectorize/VPlanConstruction.cpp | 2 +- .../Transforms/Vectorize/VPlanPredicator.cpp | 117 +++++++++--------- .../Transforms/Vectorize/VPlanTransforms.h | 9 +- 5 files changed, 75 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1e76e8202bbf3..d2d1dad119ea7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9042,12 +9042,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // --------------------------------------------------------------------------- // Predicate and linearize the top-level loop region. // --------------------------------------------------------------------------- - DenseMap BlockMaskCache; - VPlanTransforms::predicateAndLinearize(*Plan, CM.foldTailByMasking(), - BlockMaskCache); + auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize( + *Plan, CM.foldTailByMasking()); // --------------------------------------------------------------------------- - // Construct recipes for the instructions in the loop + // Construct wide recipes and apply predication for original scalar + // VPInstructions in the loop. // --------------------------------------------------------------------------- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, Builder, BlockMaskCache, LVer); @@ -9138,7 +9138,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // to remove the need to keep a map of masks beyond the predication // transform. 
RecipeBuilder.updateBlockMaskCache(Old2New); - for (const auto &[Old, New] : Old2New) + for (const auto &[Old, _] : Old2New) Old->getDefiningRecipe()->eraseFromParent(); assert(isa(Plan->getVectorLoopRegion()) && diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 264b1ea3deb97..38ddc6d696e80 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -68,6 +68,9 @@ class VPRecipeBuilder { VPBuilder &Builder; + /// The mask of each VPBB, generated earlier and used for predicating recipes + /// in VPBB. + /// TODO: remove by applying predication when generating the masks. DenseMap &BlockMaskCache; // VPlan construction support: Hold a mapping from ingredients to @@ -180,7 +183,8 @@ class VPRecipeBuilder { Ingredient2Recipe[I] = R; } - /// Returns the *entry* mask for the block \p BB. + /// Returns the *entry* mask for block \p VPBB or null if the mask is + /// all-true. VPValue *getBlockInMask(VPBasicBlock *VPBB) const { return BlockMaskCache.lookup(VPBB); } @@ -209,7 +213,7 @@ class VPRecipeBuilder { return Plan.getOrAddLiveIn(V); } - void updateBlockMaskCache(const DenseMap &Old2New) { + void updateBlockMaskCache(DenseMap &Old2New) { for (auto &[_, V] : BlockMaskCache) { if (auto *New = Old2New.lookup(V)) { V->replaceAllUsesWith(New); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 92bd49ace3638..7d25855d3db1a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -65,7 +65,7 @@ class PlainCFGBuilder { PlainCFGBuilder(Loop *Lp, LoopInfo *LI) : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)) {} - /// Build plain CFG for TheLoop and connects it to Plan's entry. + /// Build plain CFG for TheLoop and connect it to Plan's entry. 
std::unique_ptr buildPlainCFG(); }; } // anonymous namespace diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index e0e0509353639..f692d3910f4b1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -22,7 +22,6 @@ using namespace llvm; namespace { class VPPredicator { - using BlockMaskCacheTy = DenseMap; /// Builder to construct recipes to compute masks. VPBuilder Builder; @@ -31,14 +30,16 @@ class VPPredicator { using EdgeMaskCacheTy = DenseMap, VPValue *>; + using BlockMaskCacheTy = DenseMap; EdgeMaskCacheTy EdgeMaskCache; - BlockMaskCacheTy &BlockMaskCache; + BlockMaskCacheTy BlockMaskCache; /// Create an edge mask for every destination of cases and/or default. void createSwitchEdgeMasks(VPInstruction *SI); - /// Computes and return the predicate of the edge between \p Src and \p Dst. + /// Computes and returns the predicate of the edge between \p Src and \p Dst, + /// possibly inserting new recipes at \p Dst (using Builder's insertion point). VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); /// Returns the *entry* mask for \p VPBB. @@ -46,23 +47,25 @@ class VPPredicator { return BlockMaskCache.lookup(VPBB); } + /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not + /// already have a mask. void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { // TODO: Include the masks as operands in the predicated VPlan directly to - // remove the need to keep a map of masks beyond the predication transform. + // avoid keeping the map of masks beyond the predication transform. assert(!getBlockInMask(VPBB) && "Mask already set"); BlockMaskCache[VPBB] = Mask; } + /// Record \p Mask as the mask of the edge from \p Src to \p Dst. The edge is + /// expected to not have a mask already.
VPValue *setEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst, VPValue *Mask) { + assert(Src != Dst && "Src and Dst must be different"); assert(!getEdgeMask(Src, Dst) && "Mask already set"); return EdgeMaskCache[{Src, Dst}] = Mask; } public: - VPPredicator(BlockMaskCacheTy &BlockMaskCache) - : BlockMaskCache(BlockMaskCache) {} - /// Returns the precomputed predicate of the edge from \p Src to \p Dst. VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { return EdgeMaskCache.lookup({Src, Dst}); @@ -72,8 +75,13 @@ class VPPredicator { void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail); /// Compute and return the predicate of \p VPBB, assuming that the header - /// block of the loop is set to True or the loop mask when tail folding. + /// block of the loop is set to True, or to the loop mask when tail folding. VPValue *createBlockInMask(VPBasicBlock *VPBB); + + /// Convert phi recipes in \p VPBB to VPBlendRecipes. + void convertPhisToBlends(VPBasicBlock *VPBB); + + const BlockMaskCacheTy getBlockMaskCache() const { return BlockMaskCache; } }; } // namespace @@ -119,7 +127,8 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) { } VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) { - Builder.setInsertPoint(VPBB, VPBB->begin()); + // Start inserting after the block's phis, which will be replaced by blends later. + Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi()); // All-one mask is modelled as no-mask following the convention for masked // load/store/gather/scatter. Initialize BlockMask to no-mask. VPValue *BlockMask = nullptr; @@ -156,12 +165,11 @@ void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) { // constructing the desired canonical IV in the header block as its first // non-phi instructions.
- auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); auto &Plan = *HeaderVPBB->getPlan(); auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, NewInsertionPoint); + Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + Builder.insert(IV); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); setBlockInMask(HeaderVPBB, BlockMask); @@ -170,8 +178,8 @@ void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) { void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { VPBasicBlock *Src = SI->getParent(); - // Create masks where the terminator in Src is a switch. We create mask for - // all edges at the same time. This is more efficient, as we can create and + // Create masks where SI is a switch. We create masks for all edges from SI's + // parent block at the same time. This is more efficient, as we can create and // collect compares for all cases once. VPValue *Cond = SI->getOperand(0); VPBasicBlock *DefaultDst = cast(Src->getSuccessors()[0]); @@ -219,18 +227,48 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { setEdgeMask(Src, DefaultDst, DefaultMask); } -void VPlanTransforms::predicateAndLinearize( - VPlan &Plan, bool FoldTail, - DenseMap &BlockMaskCache) { +void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { + for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) { + // The non-header Phi is converted into a Blend recipe below, + // so we don't have to worry about the insertion order and we can just use + // the builder. At this point we generate the predication tree. There may + // be duplications since this is a simple recursive scan, but future + // optimizations will clean it up. 
+ auto *PhiR = cast(&R); + + SmallVector OperandsWithMask; + unsigned NumIncoming = PhiR->getNumIncoming(); + for (unsigned In = 0; In < NumIncoming; In++) { + const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); + OperandsWithMask.push_back(PhiR->getIncomingValue(In)); + VPValue *EdgeMask = getEdgeMask(Pred, VPBB); + if (!EdgeMask) { + assert(In == 0 && "Both null and non-null edge masks found"); + assert(all_equal(PhiR->operands()) && + "Distinct incoming values with one having a full mask"); + break; + } + OperandsWithMask.push_back(EdgeMask); + } + PHINode *IRPhi = cast(PhiR->getUnderlyingValue()); + auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask); + Builder.insert(Blend); + PhiR->replaceAllUsesWith(Blend); + PhiR->eraseFromParent(); + } +} + +DenseMap +VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); ReversePostOrderTraversal> RPOT( Header); - VPPredicator Predicator(BlockMaskCache); + VPPredicator Predicator; for (VPBlockBase *VPB : RPOT) { - // Only regions with only VPBBs are supported at the moment. + // Non-outer regions with VPBBs only are supported at the moment. auto *VPBB = cast(VPB); // Introduce the mask for VPBB, which may introduce needed edge masks, and // convert all phi recipes of VPBB to blend recipes unless VPBB is the @@ -240,51 +278,13 @@ void VPlanTransforms::predicateAndLinearize( continue; } - SmallVector Phis; - for (VPRecipeBase &R : VPBB->phis()) - Phis.push_back(cast(&R)); - Predicator.createBlockInMask(VPBB); - - for (VPWidenPHIRecipe *PhiR : Phis) { - // The non-header Phi is converted into a Blend recipe below, - // so we don't have to worry about the insertion order and we can just use - // the builder. At this point we generate the predication tree. 
There may - // be duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - SmallVector OperandsWithMask; - unsigned NumIncoming = PhiR->getNumIncoming(); - for (unsigned In = 0; In < NumIncoming; In++) { - const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); - OperandsWithMask.push_back(PhiR->getIncomingValue(In)); - VPValue *EdgeMask = Predicator.getEdgeMask(Pred, VPBB); - if (!EdgeMask) { - assert(In == 0 && "Both null and non-null edge masks found"); - assert(all_equal(PhiR->operands()) && - "Distinct incoming values with one having a full mask"); - break; - } - OperandsWithMask.push_back(EdgeMask); - } - PHINode *IRPhi = cast(PhiR->getUnderlyingValue()); - auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask); - Blend->insertBefore(PhiR); - PhiR->replaceAllUsesWith(Blend); - PhiR->eraseFromParent(); - } + Predicator.convertPhisToBlends(VPBB); } // Linearize the blocks of the loop into one serial chain. VPBlockBase *PrevVPBB = nullptr; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { - // Handle VPBBs down to the latch. - if (VPBB == LoopRegion->getExiting()) { - if (PrevVPBB) - VPBlockUtils::connectBlocks(PrevVPBB, VPBB); - break; - } - auto Successors = to_vector(VPBB->getSuccessors()); if (Successors.size() > 1) VPBB->getTerminator()->eraseFromParent(); @@ -298,4 +298,5 @@ void VPlanTransforms::predicateAndLinearize( PrevVPBB = VPBB; } + return Predicator.getBlockMaskCache(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 62c0be97a3a55..36fc78ce566b2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -224,14 +224,13 @@ struct VPlanTransforms { unsigned VectorRegWidth); /// Predicate and linearize the control-flow in the only loop region of - /// \p Plan. If \p FoldTail is true, also create a mask guarding the loop + /// \p Plan. 
If \p FoldTail is true, create a mask guarding the loop /// header, otherwise use all-true for the header mask. Masks for blocks are - /// added to \p BlockMaskCache, which in turn will temporarily be used later + /// added to a block-to-mask map which is returned in order to be used later /// for wide recipe construction. This argument is temporary and will be /// removed in the future. - static void - predicateAndLinearize(VPlan &Plan, bool FoldTail, - DenseMap &BlockMaskCache); + static DenseMap + introduceMasksAndLinearize(VPlan &Plan, bool FoldTail); }; } // namespace llvm