Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
861d883
[VPlan] Don't use the legacy cost model for loop conditions
john-brawn-arm Jul 31, 2025
c3f2e8f
Do TC <= VF check differently to avoid llvm::PatternMatch error
john-brawn-arm Sep 4, 2025
77a2769
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Sep 8, 2025
19b984c
Add extra comment to VPInstruction::computeCost
john-brawn-arm Sep 11, 2025
2b83698
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Sep 16, 2025
ec22b08
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Sep 18, 2025
1bf3611
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Sep 22, 2025
2a6c490
Use VPlanPatternMatch for counting the number of compares
john-brawn-arm Sep 23, 2025
6e23400
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Oct 1, 2025
24fa802
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Oct 7, 2025
515264b
Use getCostForRecipeWithOpcode for cmp cost
john-brawn-arm Oct 15, 2025
50b801f
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Oct 20, 2025
2d3999b
Fix tests after merge
john-brawn-arm Oct 20, 2025
33b76ce
Adjust cmp cost calculation
john-brawn-arm Oct 22, 2025
8c21ea3
Make planContainsDifferentCompares ignore blocks outside the vector r…
john-brawn-arm Oct 22, 2025
759741c
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Oct 23, 2025
60b76a2
Update test after merge
john-brawn-arm Oct 23, 2025
8a4c2ad
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Oct 24, 2025
ae498b8
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Nov 3, 2025
a6bac7a
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Nov 10, 2025
f41782c
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Nov 18, 2025
7d21018
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Nov 25, 2025
b175ee3
Update test after merge
john-brawn-arm Nov 25, 2025
bfe6fc0
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Nov 26, 2025
7054e6b
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Dec 4, 2025
7ec4f3d
Adjust planContainsAdditionalSimplifications so loop exit conditions …
john-brawn-arm Dec 8, 2025
40a1450
Merge branch 'main' into vplan_cmp_cost
john-brawn-arm Dec 15, 2025
a4a2f8c
Update tests
john-brawn-arm Dec 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 80 additions & 43 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6899,46 +6899,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
}
}

/// Compute the cost of all exiting conditions of the loop using the legacy
/// cost model. This is to match the legacy behavior, which adds the cost of
/// all exit conditions. Note that this over-estimates the cost, as there will
/// be a single condition to control the vector loop.
SmallVector<BasicBlock *> Exiting;
CM.TheLoop->getExitingBlocks(Exiting);
SetVector<Instruction *> ExitInstrs;
// Collect all exit conditions.
for (BasicBlock *EB : Exiting) {
auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
continue;
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
ExitInstrs.insert(CondI);
}
}
// Compute the cost of all instructions only feeding the exit conditions.
for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
Instruction *CondI = ExitInstrs[I];
if (!OrigLoop->contains(CondI) ||
!CostCtx.SkipCostComputation.insert(CondI).second)
continue;
InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
LLVM_DEBUG({
dbgs() << "Cost of " << CondICost << " for VF " << VF
<< ": exit condition instruction " << *CondI << "\n";
});
Cost += CondICost;
for (Value *Op : CondI->operands()) {
auto *OpI = dyn_cast<Instruction>(Op);
if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
any_of(OpI->users(), [&ExitInstrs, this](User *U) {
return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
!ExitInstrs.contains(cast<Instruction>(U));
}))
continue;
ExitInstrs.insert(OpI);
}
}

// Pre-compute the costs for branches except for the backedge, as the number
// of replicate regions in a VPlan may not directly match the number of
// branches, which would lead to different decisions.
Expand Down Expand Up @@ -7058,8 +7018,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
// Unused FOR splices are removed by VPlan transforms, so the VPlan-based
// cost model won't cost it whilst the legacy will.
using namespace VPlanPatternMatch;
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
using namespace VPlanPatternMatch;
if (none_of(FOR->users(),
match_fn(m_VPInstruction<
VPInstruction::FirstOrderRecurrenceSplice>())))
Expand Down Expand Up @@ -7090,10 +7050,19 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}

// The VPlan-based cost model knows that if TC <= VF then no compare is
// needed in branch-on-count, but the legacy cost model does not.
if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
Value *TC = Plan.getTripCount()->getUnderlyingValue();
ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(TC);
if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
return true;
}

if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
using namespace VPlanPatternMatch;
CmpPredicate Pred;
if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
cast<VPRecipeWithIRFlags>(R).getPredicate() !=
Expand All @@ -7104,6 +7073,37 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
}

// The VPlan may have been transformed such that the original loop exit
// condition, and the instructions that are used only by it, no longer
// exists, but we expect the transformed version to have the same cost.
// Therefore mark all such instructions as seen.
SmallVector<BasicBlock *> Exiting;
CostCtx.CM.TheLoop->getExitingBlocks(Exiting);
SetVector<Instruction *> ExitInstrs;
// Collect all exit conditions.
for (BasicBlock *EB : Exiting) {
auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
if (!Term)
continue;
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0)))
ExitInstrs.insert(CondI);
}
// Collect all instructions only feeding the exit conditions.
for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
Instruction *CondI = ExitInstrs[I];
SeenInstrs.insert(CondI);
for (Value *Op : CondI->operands()) {
auto *OpI = dyn_cast<Instruction>(Op);
if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
any_of(OpI->users(), [&ExitInstrs, TheLoop](User *U) {
return TheLoop->contains(cast<Instruction>(U)->getParent()) &&
!ExitInstrs.contains(cast<Instruction>(U));
}))
continue;
ExitInstrs.insert(OpI);
}
}

// Return true if the loop contains any instructions that are not also part of
// the VPlan or are skipped for VPlan-based cost computations. This indicates
// that the VPlan contains extra simplifications.
Expand All @@ -7119,6 +7119,39 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
});
});
}

static bool planContainsDifferentCompares(VPlan &Plan, VPCostContext &CostCtx,
Loop *TheLoop, ElementCount VF) {
// Count how many compare instructions there are in the legacy cost model.
unsigned NumLegacyCompares = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
for (auto &I : *BB) {
if (isa<CmpInst>(I)) {
NumLegacyCompares += 1;
}
}
}

// Count how many compare instructions there are in the VPlan.
unsigned NumVPlanCompares = 0;
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
auto Iter = vp_depth_first_deep(VectorRegion->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vp_depth_first_deep will also leave the region and visit its successors, so we will also count the compare in the middle block, and almost always overcount the compares in VPlan. Probably needs to check if we left the region.

I think it will also always disable the check if we have loops controlled by active-lane-mask?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed this to ignore blocks outside of the vector loop region.

I think it will also always disable the check if we have loops controlled by active-lane-mask?

I'm not sure what you're asking here. When we have a loop that's using an active-lane-mask, the vplan will have something like (example here taken from llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll)

Cost of 1 for VF vscale x 4: EMIT vp<%active.lane.mask.next> = active lane mask vp<%10>, vp<%4>, ir<1>
Cost of 0 for VF vscale x 4: EMIT vp<%11> = not vp<%active.lane.mask.next>
Cost of 0 for VF vscale x 4: EMIT branch-on-cond vp<%11>

the legacy cost model will have

LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %exitcond.not = icmp eq i64 %iv.next, %n
LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   br i1 %exitcond.not, label %for.end, label %for.body

planContainsDifferentCompares would count 1 compare in the legacy cost model, no compares in the vplan, and return true.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

planContainsDifferentCompares would count 1 compare in the legacy cost model, no compares in the vplan, and return true.

Yep, what I was wondering was if we could exclude plans with ActiveLaneMask terminated exiting blocks from the carve-out, to preserve the original check for those?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, it's still not clear what you're asking here. Do you mean: planContainsDifferentCompares should return false for plans that contain ActiveLaneMask terminated exiting blocks, so that the assert in computeBestVF that calls planContainsDifferentCompares will do the check BestFactor.Width == LegacyVF.Width? If so then possibly we could, though I don't think it would make a difference as I haven't found an example where planContainsAdditionalSimplifications doesn't also return true (which it will do because the cmp in the legacy cost model doesn't correspond to anything in the vplan).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for checking!

It looks like there's still something related to the changes that effectively disables the assertion altogether. I tried the patch with something like the diff below which should cause lots of crashes, but it seems there aren't?

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7c9302860a3b..4d821ca0954c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2270,6 +2270,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
   case Instruction::ExtractValue:
   case Instruction::ICmp:
   case Instruction::FCmp:
+    return 100;
     return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
   default:
     llvm_unreachable("Unsupported opcode for instruction");

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It sounds like if planContainsAdditionalSimplifications is returning true for most loops then there's probably not much point in having the legacy/vplan cost model assert anymore? cc @fhahn

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue with bailing out on some wide accesses should be fixed as of #169249.

I've changed this to ignore blocks outside of the vector loop region.
It looks like those changes may have been dropped by accident, at least it looks like the current code would still leave the parent region via the deep traversal iterator?

One example that should crash with the PR updated to the current main and the change below is llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll, but it does not, presumably because we count the compare in the middle.block?

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c04f510dd995..048a298ef3ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3582,6 +3582,7 @@ void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,

 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
+  return 100;
   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                     ->getAddressSpace();

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In every function in llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll planContainsAdditionalSimplifications is returning true: In the first function %lftr.wideiv = trunc i64 %iv.next to i32 doesn't appear in the list of seen instructions, in the second %cond = icmp slt i64 %i.next, %n, in the third %cmp.not = icmp eq i64 %iv.next, %0. It looks like all of these are due to the loop termination condition being converted to branch-on-count.

Out of the tests in llvm/test/Transforms/LoopVectorize, only in the following does planContainsAdditionalSimplifications return false for at least one of the test functions:

  LLVM :: Transforms/LoopVectorize/AArch64/call-costs.ll
  LLVM :: Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
  LLVM :: Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
  LLVM :: Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
  LLVM :: Transforms/LoopVectorize/AArch64/optsize_minsize.ll
  LLVM :: Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
  LLVM :: Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
  LLVM :: Transforms/LoopVectorize/ARM/optsize_minsize.ll
  LLVM :: Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
  LLVM :: Transforms/LoopVectorize/X86/cost-model.ll
  LLVM :: Transforms/LoopVectorize/X86/interleave-cost.ll
  LLVM :: Transforms/LoopVectorize/X86/pr34438.ll
  LLVM :: Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
  LLVM :: Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
  LLVM :: Transforms/LoopVectorize/vectorize-zero-estimated-trip-count.ll

Strangely, with the above change in these files planContainsAdditionalSimplifications changes to returning true. I think what's going on is that because of the increase in cost BestPlan is now the scalar plan, so planContainsAdditionalSimplifications is checking the scalar plan whereas before it was checking the vector plan, and there's something in the scalar plan that causes planContainsAdditionalSimplifications to return true.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm so I think what is going on is that the original compare will almost always be removed (except if there is another user).

Previously the code would mark all exit instructions to be skipped for cost-computation (and in turn skipped in planContainsAdditionalSimplifications). But now we don't, so the difference between IR compare and BranchOnCount will cause planContainsAdditionalSimplifications to return true.

Could we still add the IR instructions to the set to skip?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we still add the IR instructions to the set to skip?

I've made this change. Doing this we also need to detect when VPInstruction::computeCost will return zero for BranchOnCount, as otherwise the assertion fails in llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll.

// Only the blocks in the vector region are relevant.
if (VPBB->getEnclosingLoopRegion() != VectorRegion)
continue;
for (VPRecipeBase &R : *VPBB) {
using namespace VPlanPatternMatch;
if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
match(&R, m_Cmp(m_VPValue(), m_VPValue())))
NumVPlanCompares += 1;
}
}

// If we have a different amount, then the legacy cost model and vplan will
// disagree.
return NumLegacyCompares != NumVPlanCompares;
}
#endif

VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
Expand Down Expand Up @@ -7226,6 +7259,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// * VPlans with additional VPlan simplifications,
// * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
// vp_scatter/vp_gather).
// * VPlans containing a different number of compare instructions to what's
// present in the original scalar loop.
// The legacy cost model doesn't properly model costs for such loops.
bool UsesEVLGatherScatter =
any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
Expand All @@ -7242,7 +7277,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
planContainsAdditionalSimplifications(
getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
planContainsAdditionalSimplifications(
getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width) ||
planContainsDifferentCompares(BestPlan, CostCtx, OrigLoop,
BestFactor.Width)) &&
" VPlan cost model and legacy cost model disagreed");
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
"when vectorizing, the scalar cost must be computed.");
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1109,6 +1109,30 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
VecTy, Ctx.CostKind, 0);
}
case VPInstruction::BranchOnCount: {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think BranchOnCond needs adding too.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BranchOnCond doesn't cause a compare instruction to be generated, it uses the condition generated by another instruction.

// If TC <= VF then this is just a branch.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what this means. Are you saying that we create a vplan for a given VF despite knowing that we will never enter the vector loop? I guess this can happen if TC is exactly equal to VF or we're using tail-folding, but not using the mask for control flow.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned in the comment below this, this transformation is happening in simplifyBranchConditionForVFAndUF and means the vector loop is executed exactly once. TC < VF can happen with tail folding, e.g. low_trip_count_fold_tail_scalarized_store in llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll.

// FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you're saying the cost of this branch is based on a prediction about what simplifyBranchConditionForVFAndUF is going to do later on? I guess that's fine so long as they're both using the same logic. Ideally both simplifyBranchConditionForVFAndUF and this code would call the same common function checking if the branch will be simplified or not. I'm just a bit worried that over time the two will diverge. Although I appreciate here in the code you'd have to assume UF=1.

For example, if you pulled this code out of simplifyBranchConditionForVFAndUF into a common function you could reuse it in both places:

    // Try to simplify the branch condition if TC <= VF * UF when the latch
    // terminator is   BranchOnCount or BranchOnCond where the input is
    // Not(ActiveLaneMask).
    const SCEV *TripCount =
        vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
    assert(!isa<SCEVCouldNotCompute>(TripCount) &&
           "Trip count SCEV must be computable");
    ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
    const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
    if (TripCount->isZero() ||
        !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
      return false;

You'd also now be able to remove the // FIXME: The compare could also be removed if TC = M * vscale, comment below.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll look into doing that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's the way to go, we shouldn't duplicate that kind of reasoning here. The TODO should say that the branch should be simplified before we compute the costs.

As a workaround for now, catching some cases here should be fine, as this should only mean we may miss some new optimizations, but not make things worse.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I do think we should be dealing with trip counts that are multiples of vscale here too since we now support them and we know that simplifyBranchConditionForVFAndUF should correctly detect TC=3 x vscale < VF=4 x vscale. It would seem unfair to treat TC=3 <= VF=4 as a cost of 0 and TC=3 * vscale <= VF=4 * vscale as a cost of 1 just so we can keep the code simple, right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After looking into this, the problem here is that we don't have access to the ScalarEvolution object here in VPInstruction, so putting code from simplifyBranchConditionForVFAndUF into a function and calling it won't work as that makes use of ScalarEvolution. Simplifying the branch before we compute the cost seems like a good solution to this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK no problem. I wasn't sure how easy it would be, but thanks for looking into it! I can follow up with a later PR to get access to the SCEV here.

// where it checks TC <= VF * UF, but we don't know UF yet. This means in
// some cases we get a cost that's too high due to counting a cmp that
// later gets removed.
// FIXME: The compare could also be removed if TC = M * vscale,
// VF = N * vscale, and M <= N. Detecting that would require having the
// trip count as a SCEV though.
Value *TC = getParent()->getPlan()->getTripCount()->getUnderlyingValue();
ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(TC);
if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a TODO for the case where TC=vscale x M and VF=vscale * N as well? In such cases we should also be able to prove that TC <= VF because it just requires asking if M <= N.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do.

return 0;
// Otherwise BranchOnCount generates ICmpEQ followed by a branch.
Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy,
CmpInst::makeCmpResultType(ValTy),
CmpInst::ICMP_EQ, Ctx.CostKind);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you need to add the cost of the branch as well.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've deliberately avoided touching branch costs to keep the scope of this work as small as possible.

}
case Instruction::FCmp:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The change below is about more than just the loop conditions. See VPPredicator::createHeaderMask for an example where we explicitly introduce an icmp for the current tail-folding mask. In fact, I don't think the cost below is correct because the icmp can have vector inputs.

I think you either need to:

  1. Find a way to bail out if the icmp/fcmp isn't used as a branch condition, or
  2. Add support for vector types using ValTy = toVectorTy(ValTy, VF), and just make sure the example of VPPredicator::createHeaderMask is being tested in this PR.

Sorry about this!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getCostForRecipeWithOpcode already correctly handles vector compares, so I've changed this to use that function.

case Instruction::ICmp:
return getCostForRecipeWithOpcode(
getOpcode(),
vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx);
case VPInstruction::ExtractPenultimateElement:
if (VF == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
Expand Down
Loading