[AArch64][SVE] Use loop.dependence.war.mask in vector.memcheck#175943
[AArch64][SVE] Use loop.dependence.war.mask in vector.memcheck#175943
Conversation
|
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) Changes: This patch updates `addDiffRuntimeChecks()` to use the last element of a `loop.dependence.war.mask` for the check when it is "cheap". Using the last element of a `loop.dependence.war.mask` has semantics similar to the current diff checks, but allows for the legal `Diff == 0` case. The current checks are equivalent to: `if (Diff < 0 || Diff >= AccessSize * (VF * IC))`. Whereas using `loop.dependence.war.mask` is equivalent to: `if (Diff <= 0 || Diff > AccessSize * (VF * IC - 1))`. Note: `Diff >= AccessSize * (VF * IC)` is the same as `Diff > AccessSize * (VF * IC - 1)` assuming aligned pointers. On AArch64 with SVE2 this allows the diff checks to be lowered to `whilewr` + `b.nlast`. Note: This is similar to https://reviews.llvm.org/D138652 Patch is 39.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175943.diff 8 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 74857a5b83aba..939486630b50b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -858,6 +858,9 @@ class TargetTransformInfoImplBase {
switch (ICA.getID()) {
default:
break;
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask:
+ return 10;
case Intrinsic::allow_runtime_check:
case Intrinsic::allow_ubsan_check:
case Intrinsic::annotation:
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 42ed3c659edd9..9eed80f810d92 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -612,9 +612,12 @@ addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
SCEVExpander &Expander, bool HoistRuntimeChecks = false);
-LLVM_ABI Value *addDiffRuntimeChecks(
- Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC);
+LLVM_ABI Value *addDiffRuntimeChecks(Instruction *Loc,
+ ArrayRef<PointerDiffInfo> Checks,
+ SCEVExpander &Expander, ElementCount VF,
+ unsigned IC,
+ const TargetTransformInfo &TTI,
+ TTI::TargetCostKind CostKind);
/// Struct to hold information about a partially invariant condition.
struct IVConditionInfo {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 85be8db9d3ae2..59f782b986a4f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1071,9 +1071,15 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
EVT VecVT = getTLI()->getValueType(DL, RetTy);
unsigned EltSizeInBytes =
cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
- if (is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) &&
- VecVT.getVectorMinNumElements() == (16 / EltSizeInBytes))
- return 1;
+ if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
+ VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
+ break;
+ InstructionCost Cost = 1;
+ // For fixed-vector types at least a MOV and XTN are needed to convert
+ // from the predicate to a fixed-length mask.
+ if (isa<FixedVectorType>(RetTy))
+ Cost += 2;
+ return Cost;
}
break;
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 749ff98ad0066..db5a61f6b31e4 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2136,38 +2136,85 @@ Value *llvm::addRuntimeChecks(
return MemoryRuntimeCheck;
}
-Value *llvm::addDiffRuntimeChecks(
- Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
- function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {
+Value *llvm::addDiffRuntimeChecks(Instruction *Loc,
+ ArrayRef<PointerDiffInfo> Checks,
+ SCEVExpander &Expander, ElementCount VF,
+ unsigned IC, const TargetTransformInfo &TTI,
+ TTI::TargetCostKind CostKind) {
LLVMContext &Ctx = Loc->getContext();
IRBuilder ChkBuilder(Ctx, InstSimplifyFolder(Loc->getDataLayout()));
ChkBuilder.SetInsertPoint(Loc);
+ Value *RuntimeVF = nullptr;
// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;
-
auto &SE = *Expander.getSE();
+
+ constexpr Intrinsic::ID LoopDeskMaskIID = Intrinsic::loop_dependence_war_mask;
+ auto LoopDepMaskIsCheap = [&](Type *MaskTy, Value *AccessSize) {
+ Value *NullPtr = ConstantPointerNull::get(PointerType::getUnqual(Ctx));
+ // The pointer values should not change the cost. The access size (constant)
+ // is needed to by targets to cost the mask.
+ IntrinsicCostAttributes ICA(LoopDeskMaskIID, MaskTy,
+ {NullPtr, NullPtr, AccessSize});
+ InstructionCost Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+ return Cost.isValid() && Cost <= 1;
+ };
+
// Map to keep track of created compares, The key is the pair of operands for
// the compare, to allow detecting and re-using redundant compares.
DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
+ Value *IsConflict;
+ Module *M = Loc->getModule();
Type *Ty = SinkStart->getType();
- // Compute VF * IC * AccessSize.
- auto *VFTimesICTimesSize =
- ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
- ConstantInt::get(Ty, IC * AccessSize));
- Value *Diff =
- Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
-
- // Check if the same compare has already been created earlier. In that case,
- // there is no need to check it again.
- Value *IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
- if (IsConflict)
- continue;
+ Type *CheckTy = ChkBuilder.getIntNTy(Ty->getScalarSizeInBits());
+
+ VectorType *MaskTy = VectorType::get(ChkBuilder.getInt1Ty(), VF * IC);
+ Value *LoopAccessSize = ChkBuilder.getInt64(AccessSize);
+ if (!LoopDepMaskIsCheap(MaskTy, LoopAccessSize)) {
+ // Compute VF * IC * AccessSize.
+ if (!RuntimeVF)
+ RuntimeVF = ChkBuilder.CreateElementCount(CheckTy, VF);
+ auto *VFTimesICTimesSize = ChkBuilder.CreateMul(
+ RuntimeVF, ConstantInt::get(Ty, IC * AccessSize));
+ Value *Diff =
+ Expander.expandCodeFor(SE.getMinusSCEV(SinkStart, SrcStart), Ty, Loc);
+
+ // Check if the same compare has already been created earlier. In that
+ // case, there is no need to check it again.
+ IsConflict = SeenCompares.lookup({Diff, VFTimesICTimesSize});
+ if (IsConflict)
+ continue;
- IsConflict =
- ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
- SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+ IsConflict =
+ ChkBuilder.CreateICmpULT(Diff, VFTimesICTimesSize, "diff.check");
+ SeenCompares.insert({{Diff, VFTimesICTimesSize}, IsConflict});
+ } else {
+ Function *LoopDepMaskIntr =
+ Intrinsic::getOrInsertDeclaration(M, LoopDeskMaskIID, {MaskTy});
+
+ Value *Src = Expander.expandCodeFor(SrcStart, Ty, Loc);
+ Value *SrcPtr = ChkBuilder.CreateIntToPtr(Src, ChkBuilder.getPtrTy());
+ Value *Sink = Expander.expandCodeFor(SinkStart, Ty, Loc);
+ Value *SinkPtr = ChkBuilder.CreateIntToPtr(Sink, ChkBuilder.getPtrTy());
+ Value *Mask = ChkBuilder.CreateCall(
+ LoopDepMaskIntr->getFunctionType(), LoopDepMaskIntr,
+ {SrcPtr, SinkPtr, LoopAccessSize}, "loop.dep.mask");
+
+ IsConflict = SeenCompares.lookup({Mask, nullptr});
+ if (IsConflict)
+ continue;
+
+ Value *LastLaneIdx = ChkBuilder.CreateSub(
+ ChkBuilder.CreateElementCount(CheckTy, MaskTy->getElementCount()),
+ ChkBuilder.getIntN(Ty->getScalarSizeInBits(), 1));
+ Value *NoConflict =
+ ChkBuilder.CreateExtractElement(Mask, LastLaneIdx, "no.conflict");
+
+ IsConflict = ChkBuilder.CreateNot(NoConflict, "is.conflict");
+ SeenCompares.insert({{Mask, nullptr}, IsConflict});
+ }
if (NeedsFreeze)
IsConflict =
ChkBuilder.CreateFreeze(IsConflict, IsConflict->getName() + ".fr");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cdc6ecfa21bcb..4dc524c7b8582 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1876,15 +1876,9 @@ class GeneratedRTChecks {
auto DiffChecks = RtPtrChecking.getDiffChecks();
if (DiffChecks) {
- Value *RuntimeVF = nullptr;
- MemRuntimeCheckCond = addDiffRuntimeChecks(
- MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
- [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
- if (!RuntimeVF)
- RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
- return RuntimeVF;
- },
- IC);
+ MemRuntimeCheckCond =
+ addDiffRuntimeChecks(MemCheckBlock->getTerminator(), *DiffChecks,
+ MemCheckExp, VF, IC, *TTI, CostKind);
} else {
MemRuntimeCheckCond = addRuntimeChecks(
MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
diff --git a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
index 5b3070fcf347e..7acd776a91b3c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
@@ -17,10 +17,10 @@ define void @loop_dependence_war_mask(ptr %a, ptr %b) {
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-LABEL: 'loop_dependence_war_mask'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res5 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res6 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 2)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res7 = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 4)
@@ -54,10 +54,10 @@ define void @loop_dependence_raw_mask(ptr %a, ptr %b) {
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-LABEL: 'loop_dependence_raw_mask'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res5 = call <vscale x 16 x i1> @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res6 = call <vscale x 8 x i1> @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 2)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res7 = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 4)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
new file mode 100644
index 0000000000000..2c16dded9dff8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll
@@ -0,0 +1,284 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4
+; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s
+
+define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define dso_local void @alias_mask(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[LOOP_DEP_MASK:%.*]] = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr [[B]], ptr [[C]], i64 1)
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP5]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP19]], 1
+; CHECK-NEXT: [[NO_CONFLICT:%.*]] = extractelement <vscale x 16 x i1> [[LOOP_DEP_MASK]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[NO_CONFLICT]], true
+; CHECK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP8]], 4
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP24]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP14]]
+; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP17]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP16]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP20]], ptr align 1 [[TMP21]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]])
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = xor i1 [[TMP23]], true
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+;
+entry:
+ %cmp11 = icmp sgt i64 %n, 0
+ br i1 %cmp11, label %for.body, label %exit
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %add = add i8 %load.b, %load.a
+ %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv
+ store i8 %add, ptr %gep.c, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body, %entry
+ ret void
+}
+
+define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) {
+; CHECK-LABEL: define i32 @alias_mask_read_after_write(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[LOOP_DEP_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[C]], ptr [[B]], i64 4)
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP13]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP20]], 1
+; CHECK-NEXT: [[NO_CONFLICT:%.*]] = extractelement <vscale x 4 x i1> [[LOOP_DEP_MASK]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor i1 [[NO_CONFLICT]], true
+; CHECK-NEXT: br i1 [[TMP10]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP27]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP18]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP18]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [...
[truncated]
|
|
Note: The first commit in this PR is #175538 (which avoids enabling this for a bunch of fixed-length cases). |
b630be3 to
1ffc45c
Compare
| IsConflict = SeenCompares.lookup({Mask, nullptr}); | ||
| if (IsConflict) | ||
| continue; | ||
|
|
There was a problem hiding this comment.
From looking at the tests, I don't think this is currently covered? Something similar to the @test_large_number_of_group test should trigger it.
There was a problem hiding this comment.
This was not working (since the SCEV diff allows for common offsets to fold away). I've reworked this so we create: loop.dependence.war.mask(ptr null, ptr %diff) (and cache on %diff), a later inst/dag combine can fold %diff into the mask if it's a sub (TODO).
fhahn
left a comment
There was a problem hiding this comment.
Note: Diff >= AccessSize * (VF * IC) is the same as Diff > AccessSize * (VF * IC - 1) assuming aligned pointers.
The current code does not check that. From the note, it sounds like something that should be checked?
I now track the common alignment, and reject alignments less than the |
0f67b4c to
f2126aa
Compare
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
This patch updates `addDiffRuntimeChecks()` to use the last element of a `loop.dependence.war.mask` for the check when it is "cheap". This is currently determined by the cost model returning a cost <= 1. Using the last element of a `loop.dependence.war.mask` has semantics similar to the current diff checks, allows for the legal `Diff == 0` case. The current checks are equivalent to: ``` if (Diff < 0 || Diff >= AccessSize * (VF * IC)) // vector loop ``` Whereas using `loop.dependence.war.mask` is equivalent to: ``` if (Diff <= 0 || Diff > AccessSize * (VF * IC - 1)) // vector loop ``` Note: `Diff >= AccessSize * (VF * IC)` is the same as `Diff > AccessSize * (VF * IC - 1)` assuming aligned pointers. On AArch64 with SVE2 this allows the diff checks to be lowered to: ``` whilewr p0.s, x1, x2 b.nlast .Lscalar_loop ``` Note: This is similar to https://reviews.llvm.org/D138652 WIP Fixups Rm Update checks
acd952a to
7e01bdd
Compare
Why not update the current checks to allow that first? |
My understanding is that the current runtime checks deliberately ignore that case to allow for smaller code-size. For AArch64 we can maintain the smaller code-size and allow that case with |
fhahn
left a comment
There was a problem hiding this comment.
Why not update the current checks to allow that first?
My understanding is that the current runtime checks deliberately ignore that case to allow for smaller code-size. For AArch64 we can maintain the smaller code-size and allow that case with
whilewr.
FWIW, #188462 should take care of that.
This patch updates
`addDiffRuntimeChecks()` to use the last element of a `loop.dependence.war.mask` for the check when it is "cheap". This is currently determined by the cost model returning a cost <= 1. Using the last element of a
`loop.dependence.war.mask` has semantics similar to the current diff checks, but allows for the legal `Diff == 0` case. The current checks are equivalent to:
Whereas using
`loop.dependence.war.mask` is equivalent to: Note:
`Diff >= AccessSize * (VF * IC)` is the same as `Diff > AccessSize * (VF * IC - 1)` assuming aligned pointers. On AArch64 with SVE2 this allows the diff checks to be lowered to:
Note: This is similar to https://reviews.llvm.org/D138652