llvm · john-brawn-arm · Jul 31, 2025 · Sep 4, 2025 · Sep 8, 2025 · Sep 11, 2025
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6899,46 +6899,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     }
   }
 
-  /// Compute the cost of all exiting conditions of the loop using the legacy
-  /// cost model. This is to match the legacy behavior, which adds the cost of
-  /// all exit conditions. Note that this over-estimates the cost, as there will
-  /// be a single condition to control the vector loop.
-  SmallVector<BasicBlock *> Exiting;
-  CM.TheLoop->getExitingBlocks(Exiting);
-  SetVector<Instruction *> ExitInstrs;
-  // Collect all exit conditions.
-  for (BasicBlock *EB : Exiting) {
-    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
-    if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
-      continue;
-    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
-      ExitInstrs.insert(CondI);
-    }
-  }
-  // Compute the cost of all instructions only feeding the exit conditions.
-  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
-    Instruction *CondI = ExitInstrs[I];
-    if (!OrigLoop->contains(CondI) ||
-        !CostCtx.SkipCostComputation.insert(CondI).second)
-      continue;
-    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
-    LLVM_DEBUG({
-      dbgs() << "Cost of " << CondICost << " for VF " << VF
-             << ": exit condition instruction " << *CondI << "\n";
-    });
-    Cost += CondICost;
-    for (Value *Op : CondI->operands()) {
-      auto *OpI = dyn_cast<Instruction>(Op);
-      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
-          any_of(OpI->users(), [&ExitInstrs, this](User *U) {
-            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
-                   !ExitInstrs.contains(cast<Instruction>(U));
-          }))
-        continue;
-      ExitInstrs.insert(OpI);
-    }
-  }
-
   // Pre-compute the costs for branches except for the backedge, as the number
   // of replicate regions in a VPlan may not directly match the number of
   // branches, which would lead to different decisions.
@@ -7058,8 +7018,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       }
       // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
       // cost model won't cost it whilst the legacy will.
+      using namespace VPlanPatternMatch;
       if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
-        using namespace VPlanPatternMatch;
         if (none_of(FOR->users(),
                     match_fn(m_VPInstruction<
                              VPInstruction::FirstOrderRecurrenceSplice>())))
@@ -7090,10 +7050,19 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
                 RepR->getUnderlyingInstr(), VF))
           return true;
       }
+
+      // The VPlan-based cost model knows that if TC <= VF then no compare is
+      // needed in branch-on-count, but the legacy cost model does not.
+      if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
+        Value *TC = Plan.getTripCount()->getUnderlyingValue();
+        ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(TC);
+        if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
+          return true;
+      }
+
       if (Instruction *UI = GetInstructionForCost(&R)) {
         // If we adjusted the predicate of the recipe, the cost in the legacy
         // cost model may be different.
-        using namespace VPlanPatternMatch;
         CmpPredicate Pred;
         if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
             cast<VPRecipeWithIRFlags>(R).getPredicate() !=
@@ -7104,6 +7073,37 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     }
   }
 
+  // The VPlan may have been transformed such that the original loop exit
+  // condition, and the instructions that are used only by it, no longer
+  // exists, but we expect the transformed version to have the same cost.
+  // Therefore mark all such instructions as seen.
+  SmallVector<BasicBlock *> Exiting;
+  CostCtx.CM.TheLoop->getExitingBlocks(Exiting);
+  SetVector<Instruction *> ExitInstrs;
+  // Collect all exit conditions.
+  for (BasicBlock *EB : Exiting) {
+    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
+    if (!Term)
+      continue;
+    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0)))
+      ExitInstrs.insert(CondI);
+  }
+  // Collect all instructions only feeding the exit conditions.
+  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
+    Instruction *CondI = ExitInstrs[I];
+    SeenInstrs.insert(CondI);
+    for (Value *Op : CondI->operands()) {
+      auto *OpI = dyn_cast<Instruction>(Op);
+      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
+          any_of(OpI->users(), [&ExitInstrs, TheLoop](User *U) {
+            return TheLoop->contains(cast<Instruction>(U)->getParent()) &&
+                   !ExitInstrs.contains(cast<Instruction>(U));
+          }))
+        continue;
+      ExitInstrs.insert(OpI);
+    }
+  }
+
   // Return true if the loop contains any instructions that are not also part of
   // the VPlan or are skipped for VPlan-based cost computations. This indicates
   // that the VPlan contains extra simplifications.
@@ -7119,6 +7119,39 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     });
   });
 }
+
+static bool planContainsDifferentCompares(VPlan &Plan, VPCostContext &CostCtx,
+                                          Loop *TheLoop, ElementCount VF) {
+  // Count how many compare instructions there are in the legacy cost model.
+  unsigned NumLegacyCompares = 0;
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    for (auto &I : *BB) {
+      if (isa<CmpInst>(I)) {
+        NumLegacyCompares += 1;
+      }
+    }
+  }
+
+  // Count how many compare instructions there are in the VPlan.
+  unsigned NumVPlanCompares = 0;
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto Iter = vp_depth_first_deep(VectorRegion->getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    // Only the blocks in the vector region are relevant.
+    if (VPBB->getEnclosingLoopRegion() != VectorRegion)
+      continue;
+    for (VPRecipeBase &R : *VPBB) {
+      using namespace VPlanPatternMatch;
+      if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+          match(&R, m_Cmp(m_VPValue(), m_VPValue())))
+        NumVPlanCompares += 1;
+    }
+  }
+
+  // If we have a different amount, then the legacy cost model and vplan will
+  // disagree.
+  return NumLegacyCompares != NumVPlanCompares;
+}
 #endif
 
 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
@@ -7226,6 +7259,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // * VPlans with additional VPlan simplifications,
   // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
   //   vp_scatter/vp_gather).
+  // * VPlans containing a different number of compare instructions to what's
+  //   present in the original scalar loop.
   // The legacy cost model doesn't properly model costs for such loops.
   bool UsesEVLGatherScatter =
       any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
@@ -7242,7 +7277,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
        planContainsAdditionalSimplifications(
            getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
        planContainsAdditionalSimplifications(
-           getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
+           getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width) ||
+       planContainsDifferentCompares(BestPlan, CostCtx, OrigLoop,
+                                     BestFactor.Width)) &&
       " VPlan cost model and legacy cost model disagreed");
   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
          "when vectorizing, the scalar cost must be computed.");

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1109,6 +1109,30 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
     return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
                                                     VecTy, Ctx.CostKind, 0);
   }
+  case VPInstruction::BranchOnCount: {
+    // If TC <= VF then this is just a branch.
+    // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF
+    // where it checks TC <= VF * UF, but we don't know UF yet. This means in
+    // some cases we get a cost that's too high due to counting a cmp that
+    // later gets removed.
+    // FIXME: The compare could also be removed if TC = M * vscale,
+    // VF = N * vscale, and M <= N. Detecting that would require having the
+    // trip count as a SCEV though.
+    Value *TC = getParent()->getPlan()->getTripCount()->getUnderlyingValue();
+    ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(TC);
+    if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
+      return 0;
+    // Otherwise BranchOnCount generates ICmpEQ followed by a branch.
+    Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
+    return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy,
+                                      CmpInst::makeCmpResultType(ValTy),
+                                      CmpInst::ICMP_EQ, Ctx.CostKind);
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+    return getCostForRecipeWithOpcode(
+        getOpcode(),
+        vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx);
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();