llvm · artagnon · Mar 13, 2026
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1211,7 +1211,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     ActiveLaneMask,
     ExplicitVectorLength,
     CalculateTripCountMinusVF,
-    // Increment the canonical IV separately for each unrolled part.
+    // Increment the canonical IV separately for each unrolled part. Meant to be
+    // constructed with two operands, namely the Start value and VF. Unrolling
+    // and conversion to concrete recipes add an extra Offset operand and the
+    // recipe produces `add Start, Offset`. The offset for unrolled part 0 is 0.
     CanonicalIVIncrementForPart,
     // Abstract instruction that compares two values and branches. This is
     // lowered to ICmp + BranchOnCond during VPlan to VPlan transformation.
@@ -2192,8 +2195,9 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags {
 };
 
 /// A recipe to compute the pointers for widened memory accesses of \p
-/// SourceElementTy. Unrolling adds an extra offset operand for unrolled parts >
-/// 0 and it produces `GEP Ptr, Offset`. The offset for unrolled part 0 is 0.
+/// SourceElementTy. Unrolling and conversion to concrete recipes add an extra
+/// offset operand and the recipe produces `GEP Ptr, Offset`. The offset for
+/// unrolled part 0 is 0.
 class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
   Type *SourceElementTy;
 

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -866,6 +866,25 @@ inline auto m_c_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_c_Select(Op0, m_True(), Op1);
 }
 
+/// Match a CanonicalIVIncrementForPart VPInstruction without placing any
+/// matcher on its operands.
+inline auto m_CanonicalIVIncrement() {
+  using Matcher_t =
+      VPInstruction_match<VPInstruction::CanonicalIVIncrementForPart>;
+  return Matcher_t();
+}
+
+/// Match a CanonicalIVIncrementForPart VPInstruction whose operands are
+/// matched by \p Op0, \p Op1 and \p Op2, in that order.
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline auto m_CanonicalIVIncrement(const Op0_t &Op0, const Op1_t &Op1,
+                                   const Op2_t &Op2) {
+  using Matcher_t =
+      VPInstruction_match<VPInstruction::CanonicalIVIncrementForPart, Op0_t,
+                          Op1_t, Op2_t>;
+  return Matcher_t(Op0, Op1, Op2);
+}
+
 inline auto m_CanonicalIV() { return class_match<VPCanonicalIVPHIRecipe>(); }
 
 template <typename Op0_t, typename Op1_t, typename Op2_t>

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -656,8 +656,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
+    assert(getNumOperands() == 3 &&
+           "Expected prior simplification of recipe without offset");
     auto *IV = State.get(getOperand(0), VPLane(0));
-    auto *VFxPart = State.get(getOperand(1), VPLane(0));
+    auto *VFxPart = State.get(getOperand(2), VPLane(0));
     // The canonical IV is incremented by the vectorization factor (num of
     // SIMD elements) times the unroll part.
     return Builder.CreateAdd(IV, VFxPart, Name, hasNoUnsignedWrap(),

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1620,12 +1620,16 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     }
   }
 
-  // Simplify unrolled VectorPointer without offset, or with zero offset, to
-  // just the pointer operand.
+  // Simplify unrolled VectorPointer with no offset or zero offset.
   if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
     if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
       return VPR->replaceAllUsesWith(VPR->getOperand(0));
 
+  // Simplify unrolled CanonicalIVIncrement with no offset or zero offset.
+  if (match(Def, m_CanonicalIVIncrement()))
+    if (Def->getNumOperands() == 2 || match(Def->getOperand(2), m_ZeroInt()))
+      return Def->replaceAllUsesWith(Def->getOperand(0));
+
   // VPScalarIVSteps after unrolling can be replaced by their start value, if
   // the start index is zero and only the first lane 0 is demanded.
   if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
@@ -2064,18 +2068,23 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
           m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
     assert(Index && "Expected index from ActiveLaneMask instruction");
 
-    uint64_t Part;
     if (match(Index,
-              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
-                  m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
-      Phis[Part] = Phi;
-    else {
-      // Anything other than a CanonicalIVIncrementForPart is part 0
-      assert(!match(
-          Index,
-          m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
+              m_CanonicalIVIncrement(m_VPValue(), m_VPValue(), m_ZeroInt()))) {
+      // Part 0 CanonicalIVIncrement.
       Phis[0] = Phi;
+      continue;
     }
+    // Parts > 0 use an offset of VF * Part. Run the match outside of assert()
+    // so that Part is also bound when assertions are compiled out.
+    uint64_t Part;
+    VPValue *VFOp;
+    [[maybe_unused]] bool IsPartIncrement =
+        match(Index, m_CanonicalIVIncrement(
+                         m_VPValue(), m_VPValue(VFOp),
+                         m_c_Mul(m_Deferred(VFOp), m_ConstantInt(Part))));
+    assert(IsPartIncrement &&
+           "Expected CanonicalIVIncrement with an offset of VF * Part");
+    Phis[Part] = Phi;
   }
 
   assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
@@ -3986,6 +3995,24 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
         ToRemove.push_back(Blend);
       }
 
+      // Materialize Part0 offsets for various recipes when UF = 1. When UF > 1,
+      // the unroller would have added Part0 offsets.
+      if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
+        if (!VPR->getOffset()) {
+          assert(Plan.getConcreteUF() == 1 &&
+                 "Expected unroller to have materialized offset for UF != 1");
+          VPR->addOperand(Plan.getZero(Plan.getDataLayout().getIndexType(
+              TypeInfo.inferScalarType(VPR))));
+        }
+      }
+      if (match(&R, m_CanonicalIVIncrement())) {
+        if (R.getNumOperands() == 2) {
+          assert(Plan.getConcreteUF() == 1 &&
+                 "Expected unroller to have materialized offset for UF != 1");
+          R.addOperand(
+              Plan.getZero(Plan.getVectorLoopRegion()->getCanonicalIVType()));
+        }
+      }
       if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
         if (!VEPR->getOffset()) {
           assert(Plan.getConcreteUF() == 1 &&

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -337,8 +337,8 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
     }
     if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
       VPBuilder Builder(VPR);
-      const DataLayout &DL = Plan.getDataLayout();
-      Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR));
+      Type *IndexTy =
+          Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR));
       Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
       VPValue *VF = Builder.createScalarZExtOrTrunc(
           &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
@@ -350,6 +350,16 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
       Copy->addOperand(VFxPart);
       continue;
     }
+    if (match(&R, m_CanonicalIVIncrement())) {
+      VPBuilder Builder(&R);
+      VPValue *VFxPart = Builder.createOverflowingOp(
+          Instruction::Mul, {&Plan.getVF(), getConstantInt(Part)},
+          {true, true});
+      Copy->setOperand(0, R.getOperand(0));
+      Copy->setOperand(1, R.getOperand(1));
+      Copy->addOperand(VFxPart);
+      continue;
+    }
     if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
       auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
       if (Phi && Phi->isOrdered()) {
@@ -379,19 +389,15 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
     // requiring it.
     if (isa<VPWidenCanonicalIVRecipe>(Copy))
       Copy->addOperand(getConstantInt(Part));
-
-    if (match(Copy,
-              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
-      VPBuilder Builder(Copy);
-      VPValue *ScaledByPart = Builder.createOverflowingOp(
-          Instruction::Mul, {Copy->getOperand(1), getConstantInt(Part)});
-      Copy->setOperand(1, ScaledByPart);
-    }
   }
-  if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
-    // Materialize Part0 offset for VectorEndPointer.
+  // Materialize Part0 offset for various recipes.
+  if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R))
+    VPR->addOperand(Plan.getZero(
+        Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR))));
+  if (match(&R, m_CanonicalIVIncrement()))
+    R.addOperand(getConstantInt(0));
+  if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R))
     VEPR->materializeOffset();
-  }
 }
 
 void UnrollState::unrollBlock(VPBlockBase *VPB) {
@@ -488,20 +494,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
   assert(UF > 0 && "Unroll factor must be positive");
   Plan.setUF(UF);
   llvm::scope_exit Cleanup([&Plan, UF]() {
-    auto Iter = vp_depth_first_deep(Plan.getEntry());
-    // Remove recipes that are redundant after unrolling.
-    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
-      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-        auto *VPI = dyn_cast<VPInstruction>(&R);
-        if (VPI &&
-            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
-            VPI->getOperand(1) == &Plan.getVF()) {
-          VPI->replaceAllUsesWith(VPI->getOperand(0));
-          VPI->eraseFromParent();
-        }
-      }
-    }
-
     Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
     Plan.getUF().replaceAllUsesWith(Plan.getConstantInt(TCTy, UF));
   });

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -403,8 +403,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
   VPlan *Plan = VPBB ? VPBB->getPlan() : nullptr;
   if (VPBB &&
       (VPBB == Plan->getVectorPreheader() || VPBB == Plan->getEntry())) {
-    if (match(V->getDefiningRecipe(),
-              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
+    if (match(V->getDefiningRecipe(), m_CanonicalIVIncrement()))
       return false;
     return all_of(R->operands(), isUniformAcrossVFsAndUFs);
   }

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -304,10 +304,10 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
@@ -1320,10 +1320,10 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
@@ -1572,10 +1572,10 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -13,10 +13,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP61]], 2
 ; CHECK-NEXT:    [[TMP62:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP28:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
@@ -79,10 +79,10 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP28:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])