Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,17 @@ class LoopVectorizationLegality {
/// masking.
bool canFoldTailByMasking() const;

/// Returns true if all instructions in the loop support masking or
/// speculation.
///
/// The mask may be loop-invariant if it represents a maximum safe dependence
/// distance (alias mask) or loop-variant if it is based on the induction
/// variable (e.g. tail-folding).
bool canMaskLoop() const;

/// Mark all respective loads/stores for masking. Must only be called when
/// tail-folding is possible.
void prepareToFoldTailByMasking();
/// masking is possible.
void prepareToMaskLoop();

/// Returns the primary induction variable.
PHINode *getPrimaryInduction() { return PrimaryInduction; }
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
case Intrinsic::loop_dependence_war_mask:
return true;
default:
return false;
}
Expand Down
15 changes: 11 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2131,6 +2131,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
}
}

if (!canMaskLoop())
return false;

LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

return true;
}

bool LoopVectorizationLegality::canMaskLoop() const {
// The list of pointers that we can safely read and write to remains empty.
SmallPtrSet<Value *, 8> SafePointers;

Expand All @@ -2139,17 +2148,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
LLVM_DEBUG(dbgs() << "LV: Cannot mask loop.\n");
return false;
}
}

LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

return true;
}

void LoopVectorizationLegality::prepareToFoldTailByMasking() {
void LoopVectorizationLegality::prepareToMaskLoop() {
// The list of pointers that we can safely read and write to remains empty.
SmallPtrSet<Value *, 8> SafePointers;

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,10 @@ class LoopVectorizationPlanner {
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
bool HasBranchWeights) const;

VPValue *materializeAliasMask(VPlan &Plan,
ArrayRef<PointerDiffInfo> DiffChecks,
bool HasBranchWeights);

#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
Expand Down
139 changes: 123 additions & 16 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
STATISTIC(LoopsPartialAliasVectorized,
"Number of partial aliasing loops vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
Expand Down Expand Up @@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));

static cl::opt<bool> ForcePartialAliasingVectorization(
"force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
cl::desc("Replace pointer diff checks with alias masks."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
Expand Down Expand Up @@ -1386,6 +1392,47 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}

/// Returns true if every block of the loop must be masked, either because
/// the tail is folded by masking or because partial-alias masking is active.
bool allLoopBlocksMasked() const {
  if (foldTailByMasking())
    return true;
  return maskPartialAliasing();
}

/// Decide, exactly once, whether partial alias masking can be used for this
/// loop, and cache the decision in IsPartialAliasMaskingEnabled.
///
/// Masking is only enabled when the force flag is set, the loop is maskable,
/// there are no fixed-order recurrences (we cannot yet combine the required
/// `splice.right` with the alias mask), and the loop's runtime checks are
/// non-empty pointer-diff checks that an alias mask can replace.
void checkIfPartialAliasMaskingIsEnabled() {
  assert(!IsPartialAliasMaskingEnabled &&
         "Partial alias masking already checked!");
  IsPartialAliasMaskingEnabled = [&]() -> bool {
    // Option not enabled, loop cannot be masked, or the loop has
    // fixed-order recurrences we cannot yet support.
    if (!ForcePartialAliasingVectorization || !Legal->canMaskLoop() ||
        !Legal->getFixedOrderRecurrences().empty())
      return false;
    // No runtime checks at all means no alias mask is required.
    const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
    if (!Checks)
      return false;
    // Only non-empty diff checks can be replaced with an alias mask.
    auto DiffChecks = Checks->getDiffChecks();
    return DiffChecks && !DiffChecks->empty();
  }();
}

/// Force partial alias masking off once a decision has been recorded.
/// If the decision is still pending (nullopt) this is a no-op; an engaged
/// value — whether true or false — is overwritten with false.
void disablePartialAliasMaskingIfEnabled() {
  if (IsPartialAliasMaskingEnabled.has_value())
    IsPartialAliasMaskingEnabled = false;
}

/// Returns true if all loop blocks should have partial aliases masked.
/// An undecided (nullopt) state is treated as disabled.
bool maskPartialAliasing() const {
  return IsPartialAliasMaskingEnabled.has_value() &&
         *IsPartialAliasMaskingEnabled;
}

/// Returns true if the use of wide lane masks is requested and the loop is
/// using tail-folding with a lane mask for control flow.
bool useWideActiveLaneMask() const {
Expand All @@ -1410,7 +1457,7 @@ class LoopVectorizationCostModel {
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
return allLoopBlocksMasked() || Legal->blockNeedsPredication(BB);
}

/// Returns true if VP intrinsics with explicit vector length support should
Expand Down Expand Up @@ -1604,6 +1651,9 @@ class LoopVectorizationCostModel {
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
ChosenTailFoldingStyle;

/// true if partial alias masking is enabled (nullopt = undecided).
std::optional<bool> IsPartialAliasMaskingEnabled;

/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;

Expand Down Expand Up @@ -1825,14 +1875,18 @@ class GeneratedRTChecks {
/// The kind of cost that we are calculating
TTI::TargetCostKind CostKind;

/// True if the loop is alias-masked (which allows us to omit diff checks).
bool LoopUsesAliasMasking = false;

public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
TTI::TargetCostKind CostKind)
TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
: DT(DT), LI(LI), TTI(TTI),
SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
PSE(PSE), CostKind(CostKind) {}
PSE(PSE), CostKind(CostKind),
LoopUsesAliasMasking(LoopUsesAliasMasking) {}

/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
Expand Down Expand Up @@ -1885,7 +1939,7 @@ class GeneratedRTChecks {
}

const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
if (RtPtrChecking.Need) {
if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
Expand Down Expand Up @@ -2883,8 +2937,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
if (Legal->blockNeedsPredication(I->getParent()))
return true;

// If we're not folding the tail by masking, predication is unnecessary.
if (!foldTailByMasking())
// If we're not masking, predication is unnecessary.
if (!allLoopBlocksMasked())
return false;

// All that remain are instructions with side-effects originally executed in
Expand Down Expand Up @@ -3088,10 +3142,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);

int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
// In order to be widened, the pointer should be consecutive, first of all.
if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
if (!Stride)
return false;

// Currently, we can't handle alias masking in reverse. Reversing the alias
// mask is not correct (or necessary). When combined with tail-folding the ALM
// should only be reversed where the alias-mask is true.
if (Stride < 0)
disablePartialAliasMaskingIfEnabled();

// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I, VF))
Expand Down Expand Up @@ -3613,6 +3674,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}

checkIfPartialAliasMaskingIsEnabled();

switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false);
Expand Down Expand Up @@ -4451,6 +4514,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}

if (CM.maskPartialAliasing()) {
LLVM_DEBUG(
dbgs()
<< "LEV: Epilogue vectorization not supported with alias masking");
return Result;
}

// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
Expand Down Expand Up @@ -5729,7 +5799,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// stores. Note that even with tail folding we know that at least
// one lane is active (i.e. generalized predication is not possible
// here), and the logic below depends on this fact.
if (!foldTailByMasking())
if (!allLoopBlocksMasked())
return true;

// For scalable vectors, a uniform memop load is always
Expand Down Expand Up @@ -6824,8 +6894,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}

if (CM.foldTailByMasking())
Legal->prepareToFoldTailByMasking();
if (CM.allLoopBlocksMasked())
Legal->prepareToMaskLoop();

ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
Expand Down Expand Up @@ -6937,7 +7007,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// TODO: Remove this code after stepping away from the legacy cost model and
// adding code to simplify VPlans before calculating their costs.
auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
if (TC == VF && !CM.foldTailByMasking())
if (TC == VF && !CM.allLoopBlocksMasked())
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
CostCtx.SkipCostComputation);

Expand Down Expand Up @@ -7431,6 +7501,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// compactness.
attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);

VPValue *ClampedVF = nullptr;
if (CM.maskPartialAliasing()) {
ClampedVF = materializeAliasMask(
BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
HasBranchWeights);
++LoopsPartialAliasVectorized;
}

// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());

Expand Down Expand Up @@ -7467,6 +7545,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::materializeVectorTripCount(
BestVPlan, VectorPH, CM.foldTailByMasking(),
CM.requiresScalarEpilogue(BestVF.isVector()));
VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF);
VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
VPlanTransforms::cse(BestVPlan);
VPlanTransforms::simplifyRecipes(BestVPlan);
Expand Down Expand Up @@ -8237,7 +8316,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Predicate and linearize the top-level loop region.
// ---------------------------------------------------------------------------
RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize, *Plan,
CM.foldTailByMasking());
CM.foldTailByMasking(), CM.maskPartialAliasing());

// ---------------------------------------------------------------------------
// Construct wide recipes and apply predication for original scalar
Expand Down Expand Up @@ -8483,9 +8562,9 @@ void LoopVectorizationPlanner::addReductionResultComputation(
// with fewer lanes than the VF. So the operands of the select would have
// different numbers of lanes. Partial reductions mask the input instead.
auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
if (!PhiR->isInLoop() && CM.allLoopBlocksMasked() &&
(!RR || !RR->isPartialReduction())) {
VPValue *Cond = vputils::findHeaderMask(*Plan);
VPValue *Cond = vputils::findLoopBodyMask(*Plan);
VPIRFlags Flags = PhiTy->isFloatingPointTy()
? VPIRFlags(RdxDesc.getFastMathFlags())
: VPIRFlags();
Expand Down Expand Up @@ -8688,6 +8767,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}

/// Materialize an alias mask for \p DiffChecks in \p Plan and guard the
/// vector loop with a minimum-VF check block.
///
/// \param Plan the VPlan to modify.
/// \param DiffChecks the pointer-diff checks the alias mask replaces.
/// \param HasBranchWeights whether to attach branch weights to the new
///        check block's terminator.
/// \returns the VF clamped by the alias mask (the number of lanes that are
///          known free of aliasing).
VPValue *LoopVectorizationPlanner::materializeAliasMask(
    VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
  VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check");
  // Use the DiffChecks passed by the caller rather than re-querying them from
  // the legality analysis; the parameter was previously ignored, duplicating
  // the caller's lookup.
  VPValue *ClampedVF =
      VPlanTransforms::materializeAliasMask(Plan, MinVFCheck, DiffChecks);
  VPBuilder Builder(MinVFCheck);
  Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
  // Check the "ClampedVF" from the alias mask contains at least two elements;
  // otherwise vectorizing is pointless and we fall back to the scalar loop.
  VPValue *Cond = Builder.createICmp(
      CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf");
  VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights);
  return ClampedVF;
}

void LoopVectorizationPlanner::addMinimumIterationCheck(
VPlan &Plan, ElementCount VF, unsigned UF,
ElementCount MinProfitableTripCount) const {
Expand Down Expand Up @@ -8800,7 +8894,8 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);

{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
CM.maskPartialAliasing());
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
Expand Down Expand Up @@ -9649,7 +9744,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);

GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
CM.maskPartialAliasing());
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
Expand Down Expand Up @@ -9768,6 +9864,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
IC = 1;
}

if (CM.maskPartialAliasing()) {
LLVM_DEBUG(
dbgs()
<< "LV: Not interleaving due to partial aliasing vectorization.\n");
IntDiagMsg = {
"PartialAliasingVectorization",
"Unable to interleave due to partial aliasing vectorization."};
InterleaveLoop = false;
IC = 1;
}

// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
Expand Down
Loading
Loading