diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f2e9c3146b0e8..caa9490565d87 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -517,7 +517,18 @@ class SCEVAddRecForUniformityRewriter
         SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
     const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
     const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
-    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
+    // We have to be careful when creating new SCEVAddRec expressions because
+    // we may pick up a cached SCEV object with wrap flags already set. This
+    // then leads to random behaviour depending upon which combinations of
+    // Offset, StepMultiplier and TheLoop are used. The safest thing we can do
+    // here is to reuse existing wrap flags on the scalar SCEV, since if the
+    // scalar version of the SCEV cannot wrap then the vector version also
+    // cannot. There are situations where the lane of the vector may exceed the
+    // trip count, such as tail-folding. In those cases we shouldn't even be
+    // asking if something is uniform anyway.
+    const SCEV *Res =
+        SE.getAddRecExpr(NewStart, NewStep, TheLoop, Expr->getNoWrapFlags());
+    return Res;
   }
 
   const SCEV *visit(const SCEV *S) {
@@ -554,7 +565,6 @@ class SCEVAddRecForUniformityRewriter
     SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                              TheLoop);
     const SCEV *Result = Rewriter.visit(S);
-
     if (Rewriter.canAnalyze())
       return Result;
     return SE.getCouldNotCompute();
@@ -566,6 +576,8 @@ class SCEVAddRecForUniformityRewriter
 bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
   if (isInvariant(V))
     return true;
+  // TODO: Even for scalable vectors we can use the maximum value of vscale
+  // to estimate the maximum possible lane.
   if (VF.isScalable())
     return false;
   if (VF.isScalar())
@@ -605,7 +617,16 @@ bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
   // stores from being uniform. The current lowering simply doesn't handle
   // it; in particular, the cost model distinguishes scatter/gather from
   // scalar w/predication, and we currently rely on the scalar path.
-  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
+  // NOTE: Loops with uncountable early exits may have vectors whose lanes
+  // exceed the exit point so it's unsafe to reason about uniformity.
+  // FIXME: What about tail-folding? I think there is a serious bug here.
+  // We cannot reason about uniformity when tail-folding because in the last
+  // vector iteration the pointer calculations are being performed beyond
+  // the end of the loop. The function isUniformMemOp assumes that the
+  // calculations are only being performed up to the actual exit point
+  // because it passes in the scalar loop pointer.
+  return !hasUncountableEarlyExit() && !blockNeedsPredication(I.getParent()) &&
+         isUniform(Ptr, VF);
 }
 
 bool LoopVectorizationLegality::canVectorizeOuterLoop() {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-scev-rewrite.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-scev-rewrite.ll
new file mode 100644
index 0000000000000..d29a22f3802f0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-scev-rewrite.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 6
+; RUN: opt -S -p loop-vectorize -mattr=+sve -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define void @foo(ptr %p1, ptr %p2, i32 %n) vscale_range(1,16) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP1]], 67108863
+; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[TMP2]], 67108863
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[P1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i64, ptr [[P1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 63)
+; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i64> [[STEP_ADD]], splat (i64 63)
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa [[LONG_TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8, !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr <4 x i64> [[BROADCAST_SPLAT]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr <4 x i64> [[BROADCAST_SPLAT2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = and <4 x i32> [[TMP13]], splat (i32 1)
+; CHECK-NEXT:    [[TMP16:%.*]] = and <4 x i32> [[TMP14]], splat (i32 1)
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i64 4
+; CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP17]], align 4, !tbaa [[INT_TBAA4:![0-9]+]]
+; CHECK-NEXT:    store <4 x i32> [[TMP16]], ptr [[TMP18]], align 4, !tbaa [[INT_TBAA4]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[END:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %lsr = lshr i64 %iv, 6
+  %lsr.zext = and i64 %lsr, 67108863
+  %gep.load = getelementptr inbounds nuw i64, ptr %p1, i64 %lsr.zext
+  %and = and i64 %iv, 63
+  %load = load i64, ptr %gep.load, align 8, !tbaa !50
+  %lsr2 = lshr i64 %load, %and
+  %lsr2.zext = trunc i64 %lsr2 to i32
+  %val = and i32 %lsr2.zext, 1
+  %gep.store = getelementptr inbounds nuw i32, ptr %p2, i64 %iv
+  store i32 %val, ptr %gep.store, align 4, !tbaa !7
+  %iv.next = add nuw nsw i64 %iv, 1
+  %icmp = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %icmp, label %end, label %loop
+
+end:
+  ret void
+}
+
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!20 = !{!"long", !9, i64 0}
+!50 = !{!20, !20, i64 0}
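
Reviewer note, not part of the patch: the new test expects one scalar load plus broadcast per 4-lane part because the vector induction advances by VF * IC = 8 each iteration, so every part starts at a multiple of 4 and all four of its lane indices produce the same value of (iv lshr 6), i.e. the same element of %p1. The short C++ program below is my own sanity check of that lane arithmetic; the constants 4, 2, 6 and 8 simply mirror the test and nothing in it is taken from the LLVM sources.

#include <cassert>
#include <cstdint>

// Mirrors the vector body of @foo at VF = 4, IC = 2: the induction starts
// each iteration at a multiple of 8, the two parts start at Index and
// Index + 4, and every lane of a part must compute the same %p1 index,
// i.e. the same (iv >> 6) value, for the load to be uniform.
int main() {
  for (std::uint64_t Index = 0; Index < 4096; Index += 8) {
    for (std::uint64_t Part = 0; Part < 2; ++Part) {
      std::uint64_t Base = Index + Part * 4;
      for (std::uint64_t Lane = 1; Lane < 4; ++Lane)
        assert(((Base + Lane) >> 6) == (Base >> 6) &&
               "lanes within a part must agree on the %p1 element");
    }
  }
  return 0;
}

The check runs clean because a part base that is a multiple of 4 is at most 60 modulo 64, so the remaining three lanes never cross a 64-element boundary.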