27 changes: 24 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -517,7 +517,18 @@ class SCEVAddRecForUniformityRewriter
SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
// We have to be careful when creating new SCEVAddRec expressions because
// we may pick up a cached SCEV object with wrap flags already set. This
// then leads to random behaviour depending upon which combinations of
// Offset, StepMultiplier and TheLoop are used. The safest thing we can do
// here is to reuse existing wrap flags on the scalar SCEV, since if the
// scalar version of the SCEV cannot wrap then the vector version also
// cannot. There are situations where the lane of the vector may exceed the
// trip count, such as tail-folding. In those cases we shouldn't even be
// asking if something is uniform anyway.
const SCEV *Res =
SE.getAddRecExpr(NewStart, NewStep, TheLoop, Expr->getNoWrapFlags());
Contributor

I am not sure it is safe to reuse the no-wrap flags from the original addrec when the start/step are different: the AddRec we create here must also be valid for the original scalar loop, because ScalarEvolution will cache it. Suppose we don't vectorize and a later pass constructs the same AddRec; it would then pick up the cached variant with potentially incorrect flags.
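
To make the concern concrete, here is a small worked example (the numbers and the i8 type are invented purely for illustration):

  scalar IV over a 40-iteration loop:    {0,+,1}<nuw><nsw>   (max value 39, flags hold)
  rewritten for lane 3 with VF*UF = 8:   {3,+,8}<nuw><nsw>   (flags copied from the scalar IV)

ScalarEvolution uniques AddRecs by their operands and loop, not by their flags, so if we end up not vectorizing and a later pass builds {3,+,8} for the same scalar loop, it gets this cached node. Over 40 scalar iterations that expression reaches 3 + 8*39 = 315, which wraps i8, so the copied <nuw><nsw> flags do not actually hold.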

Contributor Author

OK, the alternatives seem to me to be:

  1. Disable this optimisation entirely, because it seems like bad behaviour to insert rogue SCEV nodes into the cache that shouldn't be there. Any later analysis that picks up these cached nodes may be surprised to find no wrap flags set on nodes it can prove do not wrap (perhaps because the IR instructions themselves carry nsw/nuw). Or,
  2. Have a way of preventing ScalarEvolution from inserting nodes into the cache while performing these rewrites (see the sketch below).
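
To make (2) less abstract, here is a purely hypothetical sketch of what such a guard could look like. ScalarEvolution has no facility like this today, and every name below is invented for the example; the point is only the RAII shape: while the guard is alive, speculative rewrites would bypass the uniquing cache, so no stale flags could leak to later passes.

// Purely illustrative sketch of option (2): an RAII guard that suppresses
// caching while speculative rewrites run. ScalarEvolution has no such hook
// today; every name below is invented for the example.
#include <cassert>

class HypotheticalSCEVCache {
  bool CachingSuppressed = false;

public:
  // While a guard is alive, newly built expressions would not be uniqued
  // into the cache, so speculative rewrites could not leak stale flags.
  class SuppressCachingGuard {
    HypotheticalSCEVCache &Cache;
    bool Previous;

  public:
    SuppressCachingGuard(HypotheticalSCEVCache &C)
        : Cache(C), Previous(C.CachingSuppressed) {
      Cache.CachingSuppressed = true;
    }
    ~SuppressCachingGuard() { Cache.CachingSuppressed = Previous; }
  };

  bool shouldCache() const { return !CachingSuppressed; }
};

int main() {
  HypotheticalSCEVCache Cache;
  assert(Cache.shouldCache());
  {
    // Rewrites performed inside this scope would bypass the cache.
    HypotheticalSCEVCache::SuppressCachingGuard Guard(Cache);
    assert(!Cache.shouldCache());
  }
  assert(Cache.shouldCache());
  return 0;
}

In real code the guard would of course have to wrap the actual folding-set insertion inside getAddRecExpr and friends, which is a bigger change than this sketch suggests.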

Contributor Author

I just noticed #169576 landed 2 days ago! So perhaps it has fixed it, but I'd like to check. I'm still worried about inserting rogue nodes into the cache.

return Res;
}

const SCEV *visit(const SCEV *S) {
@@ -554,7 +565,6 @@ class SCEVAddRecForUniformityRewriter
SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
TheLoop);
const SCEV *Result = Rewriter.visit(S);

if (Rewriter.canAnalyze())
return Result;
return SE.getCouldNotCompute();
@@ -566,6 +576,8 @@
bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
if (isInvariant(V))
return true;
// TODO: Even for scalable vectors we can use the maximum value of vscale
// to estimate the maximum possible lane.
if (VF.isScalable())
return false;
if (VF.isScalar())
@@ -605,7 +617,16 @@ bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
// stores from being uniform. The current lowering simply doesn't handle
// it; in particular, the cost model distinguishes scatter/gather from
// scalar w/predication, and we currently rely on the scalar path.
return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
// NOTE: Loops with uncountable early exits may have vectors whose lanes
// exceed the exit point so it's unsafe to reason about uniformity.
// FIXME: What about tail-folding? I think there is a serious bug here.
// We cannot reason about uniformity when tail-folding because in the last
// vector iteration the pointer calculations are being performed beyond
// the end of the loop. The function isUniformMemOp assumes that the
// calculations are only being performed up to the actual exit point
// because it passes in the scalar loop pointer.
return !hasUncountableEarlyExit() && !blockNeedsPredication(I.getParent()) &&
isUniform(Ptr, VF);
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
82 changes: 82 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/uniform-scev-rewrite.ll
@@ -0,0 +1,82 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 6
; RUN: opt -S -p loop-vectorize -mattr=+sve -mtriple=aarch64-linux-gnu < %s | FileCheck %s

define void @foo(ptr %p1, ptr %p2, i32 %n) vscale_range(1,16) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 6
; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP1]], 67108863
; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP2]], 67108863
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[P1]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i64, ptr [[P1]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 63)
; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i64> [[STEP_ADD]], splat (i64 63)
; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa [[LONG_TBAA0:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8, !tbaa [[LONG_TBAA0]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = lshr <4 x i64> [[BROADCAST_SPLAT]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = lshr <4 x i64> [[BROADCAST_SPLAT2]], [[TMP8]]
; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i32> [[TMP13]], splat (i32 1)
; CHECK-NEXT: [[TMP16:%.*]] = and <4 x i32> [[TMP14]], splat (i32 1)
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[P2]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i64 4
; CHECK-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP17]], align 4, !tbaa [[INT_TBAA4:![0-9]+]]
; CHECK-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP18]], align 4, !tbaa [[INT_TBAA4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[END:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
;
entry:
%wide.trip.count = zext nneg i32 %n to i64
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%lsr = lshr i64 %iv, 6
%lsr.zext = and i64 %lsr, 67108863
%gep.load = getelementptr inbounds nuw i64, ptr %p1, i64 %lsr.zext
%and = and i64 %iv, 63
%load = load i64, ptr %gep.load, align 8, !tbaa !50
%lsr2 = lshr i64 %load, %and
%lsr2.zext = trunc i64 %lsr2 to i32
%val = and i32 %lsr2.zext, 1
%gep.store = getelementptr inbounds nuw i32, ptr %p2, i64 %iv
store i32 %val, ptr %gep.store, align 4, !tbaa !7
%iv.next = add nuw nsw i64 %iv, 1
%icmp = icmp eq i64 %iv.next, %wide.trip.count
br i1 %icmp, label %end, label %loop

end:
ret void
}

!7 = !{!8, !8, i64 0}
!8 = !{!"int", !9, i64 0}
!9 = !{!"omnipotent char", !10, i64 0}
!10 = !{!"Simple C++ TBAA"}
!20 = !{!"long", !9, i64 0}
!50 = !{!20, !20, i64 0}