8 changes: 8 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -802,6 +802,14 @@ const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
return Pred->getExitingBasicBlock();
}

VPHeaderPHIRecipe *VPRegionBlock::getCurrentIteration() {
Contributor:

Is there a benefit from exposing the helpers more widely? It looks like they only have a single use each, and I don't think the move is needed for extracting addCurrentIterationPhi from addExplicitVectorLength. Overall it seems like it would be good to try to keep the logic together?

Contributor Author:

The idea is for addCurrentIterationPhi to be shared by the transforms of EVL, FFLoad, and partial alias masking. Adding VPRegionBlock::getCurrentIteration is a step toward eventually moving addExplicitVectorLength into tryToBuildVPlanWithVPRecipes.

That said, the FFLoad and partial alias masking transforms aren’t in place here yet, and there’s only a single user right now.
It probably makes more sense to keep the logic together for the moment, focus on #166164, and revisit this once those transforms can build on it.
@lukel97 does that sound good?

Contributor:

> Adding VPRegionBlock::getCurrentIteration is a step toward eventually moving addExplicitVectorLength into tryToBuildVPlanWithVPRecipes.

Yup, that's my understanding of the plan as well. We'll need to start changing other transforms to use getCurrentIteration if we want to do EVL tail folding/partial alias masking earlier during vplan construction, since for some passes it won't be correct if they start using the canonical IV again after we've replaced it with the VPCurrentIterationPHI.

> @lukel97 does that sound good?

Yup, sounds good if we want to leave that to another PR.

Contributor:

Sounds good!

> partial alias masking earlier during vplan construction.

I don't think this needs CurrentIterationPHI, as the increment will remain loop-invariant IIUC

Contributor (@lukel97, Mar 2, 2026):

It's not that the increment needs to be invariant but the fact that the increment is no longer VFxUF, so various recipes need to adapt.

The fixupVFUsersForClampedVF transform in #182457 is doing the same thing as fixupVFUsersForEVL. It would be good to share this logic.

But also CurrentIterationPHI/VPRegionBlock::getCurrentIteration means that transforms and recipes can just directly use the correct increment, instead of relying on being fixed up later.

Contributor:

Oh I think I misunderstood this comment. I thought that we just meant removing the VPRegionBlock::getCurrentIteration() accessor.

For #151300, do we not need to reuse fixupVFUsersForEVL?

Contributor Author:

I think #182457 treats partial masking as a fixed-length stepping case, so it can simply replace VFxUF during VPlan execution.

For #151300, I think we still need the fixupVFUsers logic. My understanding is that #151300 would stack on top of #166164, which in turn depends on the VPRegionBlock::getCurrentIteration() accessor.
Once that is in place, splitting addCurrentIterationPhi out of addExplicitVectorLength would be a more natural next step.
Would you like to move #166164 forward with the getCurrentIteration() accessor?
I’m a bit tied up with some downstream issues at the moment, so I may not have much time to work on this over the next few weeks.

VPCurrentIterationPHIRecipe *CurrentIteration =
vputils::getCurrentIterationPhi(*getPlan());
if (CurrentIteration)
return CurrentIteration;
return getCanonicalIV();
}

InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
if (!isReplicator()) {
InstructionCost Cost = 0;
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4472,6 +4472,10 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
const Type *getCanonicalIVType() const {
return getCanonicalIV()->getScalarType();
}

/// Return VPCurrentIterationPHIRecipe for variable-length
/// stepping loops, or VPCanonicalIVPHIRecipe otherwise.
VPHeaderPHIRecipe *getCurrentIteration();
};

inline VPRegionBlock *VPRecipeBase::getRegion() {
150 changes: 78 additions & 72 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3090,7 +3090,7 @@ void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
VPValue *HeaderMask = nullptr, *EVL = nullptr;
for (VPRecipeBase &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
if (match(&R, m_SpecificICmp(CmpInst::ICMP_ULT, m_StepVector(),
m_VPValue(EVL))) &&
m_ZExtOrSelf(m_VPValue(EVL)))) &&
match(EVL, m_EVL(m_VPValue()))) {
HeaderMask = R.getVPSingleValue();
break;
@@ -3120,19 +3120,44 @@ }
}
}

/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
/// VF to use the EVL instead to avoid incorrect updates on the penultimate
/// iteration.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(Plan);
/// Add a VPCurrentIterationPHIRecipe to \p Plan with \p NewStep as the step
/// value, replace all uses of VPCanonicalIVPHIRecipe with
/// VPCurrentIterationPHIRecipe except for the canonical IV increment, and
/// fixup recipes that use VF to use \p NewStep instead. After this
/// transformation, VPCanonicalIVPHIRecipe is used only for loop iterations
/// counting.
Contributor:

Nit, can we mention somewhere here that NewStep doesn't need to be loop invariant?

static void addCurrentIterationPhi(VPlan &Plan, VPValue *NewStep) {
Contributor:

Suggested change:
- static void addCurrentIterationPhi(VPlan &Plan, VPValue *NewStep) {
+ static void addCurrentIterationPhi(VPlan &Plan, VPSingleDefRecipe *NewStep) {

Then you don't need the getDefiningRecipe in the FOR handling code

VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
auto *CanIVTy = LoopRegion->getCanonicalIVType();
auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
VPValue *StartV = CanonicalIVPHI->getStartValue();
// Create the CurrentIteration recipe in the vector loop.
auto *CurrentIteration =
new VPCurrentIterationPHIRecipe(StartV, DebugLoc::getUnknown());
CurrentIteration->insertAfter(CanonicalIVPHI);
VPBuilder Builder(CanonicalIVIncrement);
VPTypeAnalysis TypeInfo(Plan);
auto *NextIter = Builder.createAdd(NewStep, CurrentIteration,
CanonicalIVIncrement->getDebugLoc(),
"current.iteration.next",
{CanonicalIVIncrement->hasNoUnsignedWrap(),
CanonicalIVIncrement->hasNoSignedWrap()});
CurrentIteration->addOperand(NextIter);

// Replace all uses of VPCanonicalIVPHIRecipe by
// VPCurrentIterationPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);

// Fixup recipes that use VF to use NewStep instead.
assert(all_of(Plan.getVF().users(),
IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
VPWidenIntOrFpInductionRecipe>) &&
"User of VF that we can't transform to EVL.");
Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
"User of VF that we can't transform to variable-length step.");
Plan.getVF().replaceUsesWithIf(NewStep, [](VPUser &U, unsigned Idx) {
return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
});

@@ -3145,28 +3170,43 @@ static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
}) &&
"Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
"increment of the canonical induction.");
Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
Plan.getVFxUF().replaceUsesWithIf(NewStep, [](VPUser &U, unsigned Idx) {
// Only replace uses in VPWidenPointerInductionRecipe; The increment of the
// canonical induction must not be updated.
return isa<VPWidenPointerInductionRecipe>(U);
});

// Create a scalar phi to track the previous EVL if fixed-order recurrence is
// contained.
// Create a scalar phi to track the previous variable-length step if
// fixed-order recurrence is contained.
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
bool ContainsFORs =
any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
if (ContainsFORs) {
// TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
VPValue *MaxEVL = &Plan.getVF();
// Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
// Emit VPScalarCastRecipe in preheader if VF is not a 32 bits
// integer.
VPBuilder Builder(LoopRegion->getPreheaderVPBB());
MaxEVL = Builder.createScalarZExtOrTrunc(
MaxEVL, Type::getInt32Ty(Plan.getContext()),
TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
VPValue *VFI32 = Builder.createScalarZExtOrTrunc(
&Plan.getVF(), Type::getInt32Ty(Plan.getContext()), CanIVTy,
DebugLoc::getUnknown());

VPValue *StepI32;
VPValue *StepSource;
Contributor (on lines +3192 to +3193):

Nit

Suggested change:
- VPValue *StepI32;
- VPValue *StepSource;
+ VPValue *StepI32, *StepSource;

if (match(NewStep, m_ZExt(m_VPValue(StepSource))) &&
TypeInfo.inferScalarType(StepSource) ==
Type::getInt32Ty(Plan.getContext())) {
StepI32 = StepSource;
Contributor:

Don't we already have a simplification for trunc(zext(x)) -> x in simplifyRecipes? Are we seeing regressions if we always call Builder.createScalarZExtOrTrunc?

} else {
VPRecipeBase *StepR = NewStep->getDefiningRecipe();
Builder.setInsertPoint(StepR->getParent(),
std::next(StepR->getIterator()));
StepI32 = Builder.createScalarZExtOrTrunc(
NewStep, Type::getInt32Ty(Plan.getContext()), CanIVTy,
DebugLoc::getUnknown());
}

Builder.setInsertPoint(Header, Header->getFirstNonPhi());
VPValue *PrevEVL = Builder.createScalarPhi(
{MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
VPValue *PrevStep = Builder.createScalarPhi(
{VFI32, StepI32}, DebugLoc::getUnknown(), "prev.step");

for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
@@ -3180,7 +3220,7 @@ static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_splice,
{V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
{V1, V2, Imm, Plan.getTrue(), PrevStep, StepI32},
TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
R.getDebugLoc());
VPSplice->insertBefore(&R);
@@ -3193,30 +3233,24 @@ static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
if (!HeaderMask)
return;

// Replace header masks with a mask equivalent to predicating by EVL:
// Replace header masks with a mask equivalent to predicating by
// variable-length step:
Contributor:

Nit

Suggested change:
- // variable-length step:
+ // NewStep:

//
// icmp ule widen-canonical-iv backedge-taken-count
// ->
// icmp ult step-vector, EVL
VPRecipeBase *EVLR = EVL.getDefiningRecipe();
VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
Type *EVLType = TypeInfo.inferScalarType(&EVL);
VPValue *EVLMask = Builder.createICmp(
// icmp ult step-vector, NewStep
VPRecipeBase *StepR = NewStep->getDefiningRecipe();
Builder.setInsertPoint(StepR->getParent(), std::next(StepR->getIterator()));
VPValue *NewMask = Builder.createICmp(
CmpInst::ICMP_ULT,
Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
HeaderMask->replaceAllUsesWith(EVLMask);
Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy), NewStep);
HeaderMask->replaceAllUsesWith(NewMask);
}

/// Converts a tail folded vector loop region to step by
/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
/// iteration.
///
/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
/// this transformation.
///
/// - The header mask is replaced with a header mask based on the EVL.
///
/// - Plans with FORs have a new phi added to keep track of the EVL of the
@@ -3270,12 +3304,9 @@ void VPlanTransforms::addExplicitVectorLength(

auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
auto *CanIVTy = LoopRegion->getCanonicalIVType();
VPValue *StartV = CanonicalIVPHI->getStartValue();
auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());

// Create the CurrentIteration recipe in the vector loop.
auto *CurrentIteration =
new VPCurrentIterationPHIRecipe(StartV, DebugLoc::getUnknown());
CurrentIteration->insertAfter(CanonicalIVPHI);
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Create the AVL (application vector length), starting from TC -> 0 in steps
// of EVL.
@@ -3292,52 +3323,27 @@ }
}
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
DebugLoc::getUnknown(), "evl");

auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
Builder.setInsertPoint(CanonicalIVIncrement);
VPValue *OpVPEVL = VPEVL;

// Cast EVL to the canonical IV type.
auto *I32Ty = Type::getInt32Ty(Plan.getContext());
OpVPEVL = Builder.createScalarZExtOrTrunc(
OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());

auto *NextIter = Builder.createAdd(OpVPEVL, CurrentIteration,
CanonicalIVIncrement->getDebugLoc(),
"current.iteration.next",
{CanonicalIVIncrement->hasNoUnsignedWrap(),
CanonicalIVIncrement->hasNoSignedWrap()});
CurrentIteration->addOperand(NextIter);
VPValue *OpVPEVL = Builder.createScalarZExtOrTrunc(
VPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());

addCurrentIterationPhi(Plan, OpVPEVL);
Builder.setInsertPoint(CanonicalIVIncrement);
VPValue *NextAVL =
Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
"avl.next", {/*NUW=*/true, /*NSW=*/false});
AVLPhi->addOperand(NextAVL);

fixupVFUsersForEVL(Plan, *VPEVL);
removeDeadRecipes(Plan);

// Replace all uses of VPCanonicalIVPHIRecipe by
// VPCurrentIterationPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
// TODO: support unroll factor > 1.
Plan.setUF(1);
}

void VPlanTransforms::convertToVariableLengthStep(VPlan &Plan) {
// Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
// There should be only one VPCurrentIteration in the entire plan.
VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;

for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry())))
for (VPRecipeBase &R : VPBB->phis())
if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
assert(!CurrentIteration &&
"Found multiple CurrentIteration. Only one expected");
CurrentIteration = PhiR;
}
VPCurrentIterationPHIRecipe *CurrentIteration =
vputils::getCurrentIterationPhi(Plan);

// Early return if it is not variable-length stepping.
if (!CurrentIteration)
7 changes: 2 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -285,11 +285,8 @@ struct VPlanTransforms {
VPlan &Plan,
const std::function<bool(BasicBlock *)> &BlockNeedsPredication);

/// Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// Convert a tail-folded vector loop to use explicit vector length (EVL)
/// instead of VF elements each iteration.
static void
addExplicitVectorLength(VPlan &Plan,
const std::optional<unsigned> &MaxEVLSafeElements);
35 changes: 30 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -570,13 +570,13 @@ vputils::getRecipesForUncountableExit(VPlan &Plan,
VPSingleDefRecipe *vputils::findHeaderMask(VPlan &Plan) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
SmallVector<VPValue *> WideCanonicalIVs;
auto *FoundWidenCanonicalIVUser = find_if(
LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
assert(count_if(LoopRegion->getCanonicalIV()->users(),
VPHeaderPHIRecipe *CurrentIteration = LoopRegion->getCurrentIteration();
auto *FoundWidenCanonicalIVUser =
find_if(CurrentIteration->users(), IsaPred<VPWidenCanonicalIVRecipe>);
assert(count_if(CurrentIteration->users(),
IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
"Must have at most one VPWideCanonicalIVRecipe");
if (FoundWidenCanonicalIVUser !=
LoopRegion->getCanonicalIV()->users().end()) {
if (FoundWidenCanonicalIVUser != CurrentIteration->users().end()) {
auto *WideCanonicalIV =
cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
WideCanonicalIVs.push_back(WideCanonicalIV);
@@ -665,3 +665,28 @@ VPInstruction *vputils::findComputeReductionResult(VPReductionPHIRecipe *PhiR) {
return vputils::findUserOf<VPInstruction::ComputeReductionResult>(
cast<VPSingleDefRecipe>(SelR));
}

VPCurrentIterationPHIRecipe *vputils::getCurrentIterationPhi(VPlan &Plan) {
VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
if (LoopRegion) {
VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
if (std::next(CanIV->getIterator()) != CanIV->getParent()->end())
CurrentIteration = dyn_cast_or_null<VPCurrentIterationPHIRecipe>(
std::next(CanIV->getIterator()));
return CurrentIteration;
}

// Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
// There should be only one VPCurrentIteration in the entire plan.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry())))
for (VPRecipeBase &R : VPBB->phis())
if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
assert(!CurrentIteration &&
"Found multiple CurrentIteration. Only one expected");
CurrentIteration = PhiR;
}

return CurrentIteration;
}
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -151,6 +151,8 @@ VPInstruction *findComputeReductionResult(VPReductionPHIRecipe *PhiR);
/// the header-mask pattern manually.
VPSingleDefRecipe *findHeaderMask(VPlan &Plan);

/// Return VPCurrentIterationPHIRecipe if present, otherwise nullptr.
VPCurrentIterationPHIRecipe *getCurrentIterationPhi(VPlan &Plan);
} // namespace vputils

//===----------------------------------------------------------------------===//
6 changes: 3 additions & 3 deletions llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -33,13 +33,13 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; ZVFBFMIN-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFBFMIN-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFBFMIN-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
; ZVFBFMIN-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
Contributor Author:

We cast EVL to the canonical IV type early, before replacing VF users.

; ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
; ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x bfloat> @llvm.vp.load.nxv8bf16.p0(ptr align 2 [[TMP1]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = call <vscale x 8 x bfloat> @llvm.vp.load.nxv8bf16.p0(ptr align 2 [[TMP2]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; ZVFBFMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; ZVFBFMIN-NEXT: call void @llvm.vp.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[TMP11]], ptr align 2 [[TMP1]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; ZVFBFMIN-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
; ZVFBFMIN-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[TMP0]]
; ZVFBFMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
; ZVFBFMIN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
@@ -125,6 +125,7 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
; ZVFBFMIN-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFBFMIN-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFBFMIN-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; ZVFBFMIN-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
; ZVFBFMIN-NEXT: [[TMP7:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP6]]
; ZVFBFMIN-NEXT: [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]]
; ZVFBFMIN-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]]
@@ -135,11 +136,10 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
; ZVFBFMIN-NEXT: [[TMP14:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD1]] to <vscale x 4 x float>
; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD2]])
; ZVFBFMIN-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; ZVFBFMIN-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
; ZVFBFMIN-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[TMP6]]
; ZVFBFMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
; ZVFBFMIN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; ZVFBFMIN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; ZVFBFMIN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; ZVFBFMIN: [[MIDDLE_BLOCK]]:
; ZVFBFMIN-NEXT: br label %[[EXIT:.*]]
; ZVFBFMIN: [[EXIT]]: