27 changes: 24 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -517,7 +517,18 @@ class SCEVAddRecForUniformityRewriter
SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
// We have to be careful when creating new SCEVAddRec expressions because
// we may pick up a cached SCEV object with wrap flags already set. This
// then leads to random behaviour depending upon which combinations of
// Offset, StepMultiplier and TheLoop are used. The safest thing we can do
// here is to reuse existing wrap flags on the scalar SCEV, since if the
// scalar version of the SCEV cannot wrap then the vector version also
// cannot. There are situations where the lane of the vector may exceed the
// trip count, such as tail-folding. In those cases we shouldn't even be
// asking if something is uniform anyway.
const SCEV *Res =
SE.getAddRecExpr(NewStart, NewStep, TheLoop, Expr->getNoWrapFlags());
Contributor

I am not sure it is safe to reuse the no-wrap flags from the original addrec when the start/step are different: the AddRec we create here must also be valid for the original scalar loop, because ScalarEvolution will cache it. Suppose we don't vectorize and a later pass constructs the same AddRec; it would then pick up the cached variant with potentially incorrect flags.
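
To make the concern concrete, here is a small worked example (the numbers and the i8 type are invented purely for illustration):

  scalar IV over a 40-iteration loop:    {0,+,1}<nuw><nsw>   (max value 39, flags hold)
  rewritten for lane 3 with VF*UF = 8:   {3,+,8}<nuw><nsw>   (flags copied from the scalar IV)

ScalarEvolution uniques AddRecs by their operands and loop, not by their flags, so if we end up not vectorizing and a later pass builds {3,+,8} for the same scalar loop, it gets this cached node. Over 40 scalar iterations that expression reaches 3 + 8*39 = 315, which wraps i8, so the copied <nuw><nsw> flags do not actually hold.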

Contributor Author

OK, the alternatives seem to me to be:

  1. Disable this optimisation entirely, because it seems like bad behaviour to insert rogue SCEV nodes into the cache that shouldn't be there. Any later analysis that picks up these cached nodes may be surprised to find no wrap flags set on nodes it can prove do not wrap (perhaps because the IR instructions themselves carry nsw/nuw). Or,
  2. Have a way of preventing ScalarEvolution from inserting nodes into the cache while performing these rewrites (see the sketch below).
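
To make (2) less abstract, here is a purely hypothetical sketch of what such a guard could look like. ScalarEvolution has no facility like this today, and every name below is invented for the example; the point is only the RAII shape: while the guard is alive, speculative rewrites would bypass the uniquing cache, so no stale flags could leak to later passes.

// Purely illustrative sketch of option (2): an RAII guard that suppresses
// caching while speculative rewrites run. ScalarEvolution has no such hook
// today; every name below is invented for the example.
#include <cassert>

class HypotheticalSCEVCache {
  bool CachingSuppressed = false;

public:
  // While a guard is alive, newly built expressions would not be uniqued
  // into the cache, so speculative rewrites could not leak stale flags.
  class SuppressCachingGuard {
    HypotheticalSCEVCache &Cache;
    bool Previous;

  public:
    SuppressCachingGuard(HypotheticalSCEVCache &C)
        : Cache(C), Previous(C.CachingSuppressed) {
      Cache.CachingSuppressed = true;
    }
    ~SuppressCachingGuard() { Cache.CachingSuppressed = Previous; }
  };

  bool shouldCache() const { return !CachingSuppressed; }
};

int main() {
  HypotheticalSCEVCache Cache;
  assert(Cache.shouldCache());
  {
    // Rewrites performed inside this scope would bypass the cache.
    HypotheticalSCEVCache::SuppressCachingGuard Guard(Cache);
    assert(!Cache.shouldCache());
  }
  assert(Cache.shouldCache());
  return 0;
}

In real code the guard would of course have to wrap the actual folding-set insertion inside getAddRecExpr and friends, which is a bigger change than this sketch suggests.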

Contributor Author

I just noticed #169576 landed 2 days ago! So perhaps it has fixed it, but I'd like to check. I'm still worried about inserting rogue nodes into the cache.

return Res;
}

const SCEV *visit(const SCEV *S) {
@@ -554,7 +565,6 @@ class SCEVAddRecForUniformityRewriter
SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
TheLoop);
const SCEV *Result = Rewriter.visit(S);

if (Rewriter.canAnalyze())
return Result;
return SE.getCouldNotCompute();
@@ -566,6 +576,8 @@
bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
if (isInvariant(V))
return true;
// TODO: Even for scalable vectors we can use the maximum value of vscale
// to estimate the maximum possible lane.
if (VF.isScalable())
return false;
if (VF.isScalar())
@@ -605,7 +617,16 @@ bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
// stores from being uniform. The current lowering simply doesn't handle
// it; in particular, the cost model distinguishes scatter/gather from
// scalar w/predication, and we currently rely on the scalar path.
return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
// NOTE: Loops with uncountable early exits may have vectors whose lanes
// exceed the exit point so it's unsafe to reason about uniformity.
// FIXME: What about tail-folding? I think there is a serious bug here.
// We cannot reason about uniformity when tail-folding because in the last
// vector iteration the pointer calculations are being performed beyond
// the end of the loop. The function isUniformMemOp assumes that the
// calculations are only being performed up to the actual exit point
// because it passes in the scalar loop pointer.
return !hasUncountableEarlyExit() && !blockNeedsPredication(I.getParent()) &&
isUniform(Ptr, VF);
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
82 changes: 82 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/uniform-scev-rewrite.ll
@@ -0,0 +1,82 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 6
; RUN: opt -S -p loop-vectorize -mattr=+sve -mtriple=aarch64-linux-gnu < %s | FileCheck %s

define void @foo(ptr %p1, ptr %p2, i32 %n) vscale_range(1,16) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 6
; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP1]], 67108863
; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP2]], 67108863
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[P1]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i64, ptr [[P1]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 63)
; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i64> [[STEP_ADD]], splat (i64 63)
; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa [[LONG_TBAA0:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8, !tbaa [[LONG_TBAA0]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = lshr <4 x i64> [[BROADCAST_SPLAT]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = lshr <4 x i64> [[BROADCAST_SPLAT2]], [[TMP8]]
; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i32> [[TMP13]], splat (i32 1)
; CHECK-NEXT: [[TMP16:%.*]] = and <4 x i32> [[TMP14]], splat (i32 1)
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[P2]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i64 4
; CHECK-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP17]], align 4, !tbaa [[INT_TBAA4:![0-9]+]]
; CHECK-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP18]], align 4, !tbaa [[INT_TBAA4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[END:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
;
entry:
%wide.trip.count = zext nneg i32 %n to i64
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%lsr = lshr i64 %iv, 6
%lsr.zext = and i64 %lsr, 67108863
%gep.load = getelementptr inbounds nuw i64, ptr %p1, i64 %lsr.zext
%and = and i64 %iv, 63
%load = load i64, ptr %gep.load, align 8, !tbaa !50
%lsr2 = lshr i64 %load, %and
%lsr2.zext = trunc i64 %lsr2 to i32
%val = and i32 %lsr2.zext, 1
%gep.store = getelementptr inbounds nuw i32, ptr %p2, i64 %iv
store i32 %val, ptr %gep.store, align 4, !tbaa !7
%iv.next = add nuw nsw i64 %iv, 1
%icmp = icmp eq i64 %iv.next, %wide.trip.count
br i1 %icmp, label %end, label %loop

end:
ret void
}

!7 = !{!8, !8, i64 0}
!8 = !{!"int", !9, i64 0}
!9 = !{!"omnipotent char", !10, i64 0}
!10 = !{!"Simple C++ TBAA"}
!20 = !{!"long", !9, i64 0}
!50 = !{!20, !20, i64 0}