diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 13883883d3981..5bc26622d6093 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -25020,58 +25020,65 @@ This is an overloaded intrinsic. :: - declare <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) - declare <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) - declare <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) - declare @llvm.loop.dependence.war.mask.nxv16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %pointerDiff, i64 immarg %elementSize) + declare <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i32(i32 %pointerDiff, i64 immarg %elementSize) + declare <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointerDiff, i64 immarg %elementSize) + declare @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointerDiff, i64 immarg %elementSize) Overview: """"""""" -Given a vector load from %ptrA followed by a vector store to %ptrB, this -instruction generates a mask where an active lane indicates that the -write-after-read sequence can be performed safely for that lane, without the -danger of a write-after-read hazard occurring. +For a given pointer difference between a vector load and a vector store +(i.e., `%storePointer - %loadPointer`), where the load occurs before the store, +this instruction generates a mask where active lanes indicate a write-after-read +sequence can be performed safely. -A write-after-read hazard occurs when a write-after-read sequence for a given -lane in a vector ends up being executed as a read-after-write sequence due to -the aliasing of pointers. +Lanes are inactive when a write-after-read hazard may occur, which happens when +the load of a lane depends on a value that has yet to be written. See below for +examples. 
Arguments: """""""""" -The first two arguments are pointers and the last argument is an immediate. -The result is a vector with the i1 element type. +The first argument is a signed pointer difference. The second argument is an +immediate describing the element size in bytes. + +The result is a vector with an i1 element type. Semantics: """""""""" -``%elementSize`` is the size of the accessed elements in bytes. -The intrinsic returns ``poison`` if the distance between ``%prtA`` and ``%ptrB`` -is smaller than ``VF * %elementsize`` and either ``%ptrA + VF * %elementSize`` -or ``%ptrB + VF * %elementSize`` wrap. +``%elementSize`` is the size of the accessed elements in bytes. It is assumed +both the load and store use the same vector type. + +The intrinsic returns ``poison`` if ``%pointerDiff`` is not a multiple of +``%elementSize``. -The element of the result mask is active when loading from %ptrA then storing to -%ptrB is safe and doesn't result in a write-after-read hazard, meaning that: +For each lane of the mask, a lane is active if any of the following hold: -* (ptrB - ptrA) <= 0 (guarantees that all lanes are loaded before any stores), or -* elementSize * lane < (ptrB - ptrA) (guarantees that this lane is loaded - before the store to the same address) +* ``%pointerDiff <= 0`` (this is a signed comparison) + - All lanes can be loaded without depending on any yet-to-be-written values +* ``(%elementSize * lane) < %pointerDiff`` + - This lane can be loaded without depending on a yet-to-be-written value + - ``lane`` is in the range ``[0, VF)`` Examples: """"""""" .. 
code-block:: llvm - %loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 4) - %vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask, <4 x i32> poison) + %aAddr = ptrtoaddr ptr %ptrA to i64 + %bAddr = ptrtoaddr ptr %ptrB to i64 + %pointerDiff = sub i64 %bAddr, %aAddr + %loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %pointerDiff, i64 4) + %vecA = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask, <4 x i32> poison) [...] - call @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask) + call @llvm.masked.store.v4i32.p0(<4 x i32> %vecA, ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask) ; For the above example, consider the following cases: ; - ; 1. ptrA >= ptrB + ; 1. ptrA >= ptrB (pointerDiff <= 0) ; ; load = <0,1,2,3> ; uint32_t load = array[i+2]; ; store = <0,1,2,3> ; array[i] = store; @@ -25079,7 +25086,7 @@ Examples: ; This results in an all-true mask, as the load always occurs before the ; store, so it does not depend on any values to be stored. ; - ; 2. ptrB - ptrA = 2 * elementSize: + ; 2. pointerDiff = 2 * elementSize: ; ; load = <0,1,2,3> ; uint32_t load = array[i]; ; store = <0,1,2,3> ; array[i+2] = store; @@ -25088,7 +25095,7 @@ Examples: ; we can only read two lanes before we would read values that have yet to ; be written. ; - ; 3. ptrB - ptrA = 4 * elementSize + ; 3. pointerDiff = 4 * elementSize ; ; load = <0,1,2,3> ; uint32_t load = array[i]; ; store = <0,1,2,3> ; array[i+4] = store; @@ -25107,71 +25114,75 @@ This is an overloaded intrinsic. 
:: - declare <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) - declare <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) - declare <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) - declare @llvm.loop.dependence.raw.mask.nxv16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %pointerDiff, i64 immarg %elementSize) + declare <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i32(i32 %pointerDiff, i64 immarg %elementSize) + declare <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %pointerDiff, i64 immarg %elementSize) + declare @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %pointerDiff, i64 immarg %elementSize) Overview: """"""""" -Given a vector store to %ptrA followed by a vector load from %ptrB, this -instruction generates a mask where an active lane indicates that the -read-after-write sequence can be performed safely for that lane, without a -read-after-write hazard or a store-to-load forwarding hazard being introduced. - -A read-after-write hazard occurs when a read-after-write sequence for a given -lane in a vector ends up being executed as a write-after-read sequence due to -the aliasing of pointers. +For a given pointer difference between a vector store and a vector load +(i.e., `%loadPointer - %storePointer`), where the store occurs before the load, +this instruction generates a mask where active lanes indicate a read-after-write +sequence can be performed safely. -A store-to-load forwarding hazard occurs when a vector store writes to an -address that partially overlaps with the address of a subsequent vector load, -meaning that the vector load can't be performed until the vector store is -complete. +Lanes are inactive where there is a danger of a read-after-write or +store-to-load forwarding hazard occurring. 
A read-after-write hazard occurs +when a lane is overwritten before it can be read. A store-to-load forwarding +hazard occurs when a vector load partially depends on an immediately preceding +store. See below for examples. Arguments: """""""""" -The first two arguments are pointers and the last argument is an immediate. -The result is a vector with the i1 element type. +The first argument is a signed pointer difference. The second argument is an +immediate describing the element size in bytes. + +The result is a vector with an i1 element type. Semantics: """""""""" -``%elementSize`` is the size of the accessed elements in bytes. -The intrinsic returns ``poison`` if the distance between ``%prtA`` and ``%ptrB`` -is smaller than ``VF * %elementsize`` and either ``%ptrA + VF * %elementSize`` -or ``%ptrB + VF * %elementSize`` wrap. +``%elementSize`` is the size of the accessed elements in bytes. It is assumed +both the load and store use the same vector type. + +The intrinsic returns ``poison`` if ``%pointerDiff`` is not a multiple of +``%elementSize``. -The element of the result mask is active when storing to %ptrA then loading from -%ptrB is safe and doesn't result in aliasing, meaning that: +For each lane of the mask, a lane is active if any of the following hold: -* elementSize * lane < abs(ptrB - ptrA) (guarantees that the store of this lane - occurs before loading from this address), or -* ptrA == ptrB (doesn't introduce any new hazards that weren't in the scalar - code) +* ``%pointerDiff == 0`` + - No read-after-write hazard and the store likely can be forwarded to the load +* ``%elementSize * lane < abs(%pointerDiff)`` + - A write of this lane does not overlap with the succeeding load (``%pointerDiff > 0``) + - A read of this lane does not depend on the preceding store (``%pointerDiff < 0``) + - ``lane`` is in the range ``[0, VF)`` Examples: """"""""" .. 
code-block:: llvm - %loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 4) - call @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask) + %aAddr = ptrtoaddr ptr %ptrA to i64 + %bAddr = ptrtoaddr ptr %ptrB to i64 + %pointerDiff = sub i64 %bAddr, %aAddr + %loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %pointerDiff, i64 4) + call @llvm.masked.store.v4i32.p0(<4 x i32> %vecA, ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask) [...] - %vecB = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask, <4 x i32> poison) + %vecB = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask, <4 x i32> poison) ; For the above example, consider the following cases: ; - ; 1. ptrA == ptrB + ; 1. ptrA == ptrB (pointerDiff == 0) ; ; store = <0,1,2,3> ; array[i] = store; ; load = <0,1,2,3> ; uint32_t load = array[i]; ; ; This results in a all-true mask. There is no conflict. ; - ; 2. ptrB - ptrA = 2 * elementSize + ; 2. pointerDiff = 2 * elementSize ; ; store = <0,1,2,3> ; array[i] = store; ; load = <0,1,2,3> ; uint32_t load = array[i+2]; @@ -25179,7 +25190,7 @@ Examples: ; This results in a mask with the first two lanes active. In this case, ; only two lanes can be written without overwriting values yet to be read. ; - ; 3. ptrB - ptrA = -2 * elementSize + ; 3. pointerDiff = -2 * elementSize ; ; store = <0,1,2,3> ; array[i+2] = store; ; load = <0,1,2,3> ; uint32_t load = array[i]; diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 7812a301efbd7..d63e98ef67f58 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2190,46 +2190,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // The possible expansions are... 
// // loop_dependence_war_mask: - // diff = (ptrB - ptrA) / eltSize - // cmp = icmp sle diff, 0 + // cmp = icmp sle (diff / eltSize), 0 // upper_bound = select cmp, -1, diff // mask = get_active_lane_mask 0, upper_bound // // loop_dependence_raw_mask: - // diff = (abs(ptrB - ptrA)) / eltSize - // cmp = icmp eq diff, 0 + // cmp = icmp eq (diff / eltSize), 0 // upper_bound = select cmp, -1, diff // mask = get_active_lane_mask 0, upper_bound // - auto *PtrTy = cast(ICA.getArgTypes()[0]); - Type *IntPtrTy = IntegerType::getIntNTy( - RetTy->getContext(), thisT()->getDataLayout().getPointerSizeInBits( - PtrTy->getAddressSpace())); + Type *IntTy = ICA.getArgTypes()[0]; bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask; - InstructionCost Cost = - thisT()->getArithmeticInstrCost(Instruction::Sub, IntPtrTy, CostKind); + TTI::OperandValueInfo EltSizeOpInfo = + TTI::getOperandInfo(ICA.getArgs()[1]); + InstructionCost Cost = thisT()->getArithmeticInstrCost( + Instruction::SDiv, IntTy, CostKind, {}, EltSizeOpInfo); + if (IsReadAfterWrite) { - IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, IntPtrTy, {IntPtrTy}, - {}); + IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, IntTy, {IntTy}, {}); Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind); } - TTI::OperandValueInfo EltSizeOpInfo = - TTI::getOperandInfo(ICA.getArgs()[2]); - Cost += thisT()->getArithmeticInstrCost(Instruction::SDiv, IntPtrTy, - CostKind, {}, EltSizeOpInfo); - Type *CondTy = IntegerType::getInt1Ty(RetTy->getContext()); CmpInst::Predicate Pred = IsReadAfterWrite ? 
CmpInst::ICMP_EQ : CmpInst::ICMP_SLE; - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CondTy, - IntPtrTy, Pred, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, IntPtrTy, - CondTy, Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CondTy, IntTy, + Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, IntTy, CondTy, + Pred, CostKind); IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, - {IntPtrTy, IntPtrTy}, FMF); + {IntTy, IntTy}, FMF); Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); return Cost; } diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4469ff155b854..e1ca27f9fb0f4 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2511,13 +2511,13 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, ImmArg>] in { def int_loop_dependence_raw_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [llvm_ptr_ty, llvm_ptr_ty, llvm_i64_ty], - [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg>]>; + [llvm_anyint_ty, llvm_i64_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg>]>; def int_loop_dependence_war_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [llvm_ptr_ty, llvm_ptr_ty, llvm_i64_ty], - [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg>]>; + [llvm_anyint_ty, llvm_i64_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg>]>; def int_get_active_lane_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 82f8fd572bf19..0e0b2c8f1f7f3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -535,6 +535,7 @@ namespace { SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); + SDValue visitLOOP_DEPENDENCE_MASK(SDNode *N); SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); 
SDValue replaceStoreOfFPConstant(StoreSDNode *ST); @@ -2095,6 +2096,9 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAXIMUM: case ISD::VECREDUCE_FMINIMUM: return visitVECREDUCE(N); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + return visitLOOP_DEPENDENCE_MASK(N); #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC: #include "llvm/IR/VPIntrinsics.def" return visitVPOp(N); @@ -21132,6 +21136,22 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { return SDValue(); } +// Fold LOOP_DEPENDENCE_MASK(0, sub(B, A)) to LOOP_DEPENDENCE_MASK(A, B). +SDValue DAGCombiner::visitLOOP_DEPENDENCE_MASK(SDNode *N) { + auto *Op0Const = dyn_cast(N->getOperand(0)); + if (!Op0Const || !Op0Const->isZero()) + return SDValue(); + + SDValue Op1 = N->getOperand(1); + if (Op1.getOpcode() != ISD::SUB) + return SDValue(); + + SDValue Op10 = Op1->getOperand(0); + SDValue Op11 = Op1->getOperand(1); + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op11, Op10, + N->getOperand(2), N->getOperand(3)); +} + namespace { /// Helper structure used to slice a load in smaller loads. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 04b17b56b3d49..efa9228d49c2c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8511,18 +8511,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::loop_dependence_war_mask: - setValue(&I, - DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl, - EVT::getEVT(I.getType()), getValue(I.getOperand(0)), - getValue(I.getOperand(1)), getValue(I.getOperand(2)), - DAG.getConstant(0, sdl, MVT::i64))); - return; case Intrinsic::loop_dependence_raw_mask: - setValue(&I, - DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl, - EVT::getEVT(I.getType()), getValue(I.getOperand(0)), - getValue(I.getOperand(1)), getValue(I.getOperand(2)), - DAG.getConstant(0, sdl, MVT::i64))); + unsigned Opcode = Intrinsic == Intrinsic::loop_dependence_war_mask + ? ISD::LOOP_DEPENDENCE_WAR_MASK + : ISD::LOOP_DEPENDENCE_RAW_MASK; + SDValue PointerDiff = getValue(I.getOperand(0)); + SDValue Zero = DAG.getConstant(0, sdl, PointerDiff.getValueType()); + setValue(&I, DAG.getNode(Opcode, sdl, EVT::getEVT(I.getType()), Zero, + PointerDiff, getValue(I.getOperand(1)), + DAG.getConstant(0, sdl, MVT::i64))); return; } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 38db1ac4a2fb9..34b9afd1878f6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5578,6 +5578,19 @@ AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, SDLoc DL(Op); EVT VT = Op.getValueType(); + SDValue Op0 = Op.getOperand(0); + + // Handle operands less than 64-bit (the diff must be sign extended). 
+ if (Op0.getValueType() != MVT::i64) { + assert(Op0.getValueSizeInBits() < 64); + SDValue Op1 = Op.getOperand(1); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue Diff = DAG.getNode(ISD::SUB, DL, Op0.getValueType(), Op1, Op0); + SDValue DiffExt = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Diff); + return DAG.getNode(Op.getOpcode(), DL, VT, + {Zero, DiffExt, Op.getOperand(2), Op.getOperand(3)}); + } + unsigned LaneOffset = Op.getConstantOperandVal(3); unsigned NumElements = VT.getVectorMinNumElements(); uint64_t EltSizeInBytes = Op.getConstantOperandVal(2); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index ae7144155ad72..5b27b70c7f006 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1092,7 +1092,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ST->hasSVE2() || ST->hasSME()) { EVT VecVT = getTLI()->getValueType(DL, RetTy); unsigned EltSizeInBytes = - cast(ICA.getArgs()[2])->getZExtValue(); + cast(ICA.getArgs()[1])->getZExtValue(); if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) || VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes)) break; diff --git a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll index 74bd41db4a64d..e80644a5bdb90 100644 --- a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll +++ b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll @@ -4,186 +4,186 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefixes=CHECK,CHECK-SME ; loop.dependence.{war,raw}.mask can be lowered to while{wr,rw} if SVE2 or SME is enabled. 
-define void @loop_dependence_war_mask(ptr %a, ptr %b) { +define void @loop_dependence_war_mask(i64 %diff) { ; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask' -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res5 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 
for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res5 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %diff, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 8) ; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LABEL: 'loop_dependence_war_mask' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res5 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 4) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res8 = call 
@llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res5 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %diff, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %diff, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %diff, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 8) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: - %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) - %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) - %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) - %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) + %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 1) + %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 2) + %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 4) + %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 
%diff, i64 8) - %res5 = call @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) - %res6 = call @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) - %res7 = call @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) - %res8 = call @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) + %res5 = call @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 1) + %res6 = call @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 2) + %res7 = call @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 4) + %res8 = call @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 8) ret void } -define void @loop_dependence_raw_mask(ptr %a, ptr %b) { +define void @loop_dependence_raw_mask(i64 %diff) { ; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask' -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res5 = call @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = 
call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res5 = call @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %diff, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv8i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv4i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 8) ; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LABEL: 'loop_dependence_raw_mask' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res5 = call @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 1) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 2) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 4) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res5 = call @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %diff, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv8i1.i64(i64 %diff, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv4i1.i64(i64 %diff, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 8) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: - %res1 = call <16 x i1> 
@llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) - %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) - %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) - %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) + %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 1) + %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 2) + %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 4) + %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 8) - %res5 = call @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) - %res6 = call @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) - %res7 = call @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) - %res8 = call @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) + %res5 = call @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 1) + %res6 = call @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 2) + %res7 = call @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 4) + %res8 = call @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 8) ret void } ; Invalid element size and return type combinations must be expanded, even with sve2/sme -define void @loop_dependence_war_mask_invalid(ptr %a, ptr %b) { +define void @loop_dependence_war_mask_invalid(i64 %diff) { ; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask_invalid' -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, 
i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 8) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res9 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res10 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 
7 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %diff, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res9 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res10 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 10) ; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE2-LABEL: 'loop_dependence_war_mask_invalid' -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 8) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4) -; CHECK-SVE2-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 2) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res9 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 1) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res10 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 10) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 8) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 4) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 2) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 1) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 10) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %diff, i64 8) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %diff, i64 4) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %diff, i64 2) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res9 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 1) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res10 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 10) ; 
CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SME-LABEL: 'loop_dependence_war_mask_invalid' -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 8) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 2) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res9 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 1) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res10 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 10) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 8) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 4) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 2) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 1) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 10) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res6 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %diff, i64 8) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res7 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %diff, i64 4) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res8 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %diff, i64 2) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res9 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 1) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res10 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %diff, i64 10) ; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: - %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) - %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) - %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) - %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) - %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) + %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 8) + %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 4) + %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 2) + %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 1) + %res5 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, 
i64 10) - %res6 = call @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) - %res7 = call @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) - %res8 = call @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) - %res9 = call @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) - %res10 = call @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) + %res6 = call @llvm.loop.dependence.war.mask.v16i1.i64(i64 %diff, i64 8) + %res7 = call @llvm.loop.dependence.war.mask.v8i1.i64(i64 %diff, i64 4) + %res8 = call @llvm.loop.dependence.war.mask.v4i1.i64(i64 %diff, i64 2) + %res9 = call @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 1) + %res10 = call @llvm.loop.dependence.war.mask.v2i1.i64(i64 %diff, i64 10) ret void } -define void @loop_dependence_raw_mask_invalid(ptr %a, ptr %b) { +define void @loop_dependence_raw_mask_invalid(i64 %diff) { ; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask_invalid' -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 8) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for 
instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 4) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 2) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 1) -; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res10 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %diff, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv8i1.i64(i64 %diff, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv4i1.i64(i64 %diff, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res9 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 1) 
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res10 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 10) ; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE2-LABEL: 'loop_dependence_raw_mask_invalid' -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 8) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 4) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 2) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 1) -; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res10 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 10) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res1 = call <16 x i1> 
@llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 8) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 4) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 2) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 1) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 10) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %diff, i64 8) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv8i1.i64(i64 %diff, i64 4) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv4i1.i64(i64 %diff, i64 2) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res9 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 1) +; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res10 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 10) ; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SME-LABEL: 'loop_dependence_raw_mask_invalid' -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 
= call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) -; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 8) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 4) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 2) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res9 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 1) -; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res10 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 10) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 8) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 4) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 2) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 1) +; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 10) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res6 = call @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %diff, i64 8) 
+; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res7 = call @llvm.loop.dependence.raw.mask.nxv8i1.i64(i64 %diff, i64 4) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res8 = call @llvm.loop.dependence.raw.mask.nxv4i1.i64(i64 %diff, i64 2) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res9 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 1) +; CHECK-SME-NEXT: Cost Model: Invalid cost for instruction: %res10 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %diff, i64 10) ; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: - %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) - %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) - %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) - %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) - %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) + %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 8) + %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 4) + %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 2) + %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 1) + %res5 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 10) - %res6 = call @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) - %res7 = call @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) - %res8 = call @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) - %res9 = call @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) - %res10 = call @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) + %res6 = call @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %diff, i64 8) + %res7 = call @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %diff, i64 4) + %res8 = call 
@llvm.loop.dependence.raw.mask.v4i1.i64(i64 %diff, i64 2) + %res9 = call @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 1) + %res10 = call @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %diff, i64 10) ret void } diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll index 42833aa19a7fd..445b9485403a7 100644 --- a/llvm/test/CodeGen/AArch64/alias_mask.ll +++ b/llvm/test/CodeGen/AArch64/alias_mask.ll @@ -9,7 +9,10 @@ define <16 x i1> @whilewr_8(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointer_diff, i64 1) ret <16 x i1> %0 } @@ -21,7 +24,10 @@ define <8 x i1> @whilewr_16(ptr %a, ptr %b) { ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret entry: - %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %pointer_diff, i64 2) ret <8 x i1> %0 } @@ -33,7 +39,10 @@ define <4 x i1> @whilewr_32(ptr %a, ptr %b) { ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret entry: - %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %pointer_diff, i64 4) ret <4 x i1> %0 } @@ -45,7 +54,10 @@ define <2 x i1> @whilewr_64(ptr %a, ptr %b) { ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret entry: - %0 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + 
%pointer_diff = sub i64 %b_int, %a_int + %0 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %pointer_diff, i64 8) ret <2 x i1> %0 } @@ -57,7 +69,10 @@ define <16 x i1> @whilerw_8(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %pointer_diff, i64 1) ret <16 x i1> %0 } @@ -69,7 +84,10 @@ define <8 x i1> @whilerw_16(ptr %a, ptr %b) { ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret entry: - %0 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i64(i64 %pointer_diff, i64 2) ret <8 x i1> %0 } @@ -81,7 +99,10 @@ define <4 x i1> @whilerw_32(ptr %a, ptr %b) { ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret entry: - %0 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %pointer_diff, i64 4) ret <4 x i1> %0 } @@ -93,7 +114,10 @@ define <2 x i1> @whilerw_64(ptr %a, ptr %b) { ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret entry: - %0 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1.i64(i64 %pointer_diff, i64 8) ret <2 x i1> %0 } @@ -122,7 +146,10 @@ define <32 x i1> @whilewr_8_split(ptr %a, ptr %b) { ; CHECK-NEXT: str h1, [x8, #2] ; CHECK-NEXT: ret entry: - %0 = call <32 x i1> 
@llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1.i64(i64 %pointer_diff, i64 1) ret <32 x i1> %0 } @@ -167,7 +194,10 @@ define <64 x i1> @whilewr_8_split2(ptr %a, ptr %b) { ; CHECK-NEXT: str h1, [x8, #2] ; CHECK-NEXT: ret entry: - %0 = call <64 x i1> @llvm.loop.dependence.war.mask.v64i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <64 x i1> @llvm.loop.dependence.war.mask.v64i1.i64(i64 %pointer_diff, i64 1) ret <64 x i1> %0 } @@ -184,7 +214,10 @@ define <16 x i1> @whilewr_16_expand(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointer_diff, i64 2) ret <16 x i1> %0 } @@ -215,7 +248,10 @@ define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) { ; CHECK-NEXT: str h1, [x8] ; CHECK-NEXT: ret entry: - %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1.i64(i64 %pointer_diff, i64 2) ret <32 x i1> %0 } @@ -233,7 +269,10 @@ define <8 x i1> @whilewr_32_expand(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <8 x i1> 
@llvm.loop.dependence.war.mask.v8i1.i64(i64 %pointer_diff, i64 4) ret <8 x i1> %0 } @@ -251,7 +290,10 @@ define <16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointer_diff, i64 4) ret <16 x i1> %0 } @@ -283,7 +325,10 @@ define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) { ; CHECK-NEXT: str h1, [x8] ; CHECK-NEXT: ret entry: - %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1.i64(i64 %pointer_diff, i64 4) ret <32 x i1> %0 } @@ -301,7 +346,10 @@ define <4 x i1> @whilewr_64_expand(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %pointer_diff, i64 8) ret <4 x i1> %0 } @@ -319,7 +367,10 @@ define <8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %pointer_diff, i64 8) ret <8 x i1> %0 } @@ -337,7 +388,10 @@ define <16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; 
CHECK-NEXT: ret entry: - %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointer_diff, i64 8) ret <16 x i1> %0 } @@ -369,7 +423,10 @@ define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) { ; CHECK-NEXT: str h1, [x8] ; CHECK-NEXT: ret entry: - %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1.i64(i64 %pointer_diff, i64 8) ret <32 x i1> %0 } @@ -401,7 +458,10 @@ define <9 x i1> @whilewr_8_widen(ptr %a, ptr %b) { ; CHECK-NEXT: strh w9, [x8] ; CHECK-NEXT: ret entry: - %0 = call <9 x i1> @llvm.loop.dependence.war.mask.v9i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <9 x i1> @llvm.loop.dependence.war.mask.v9i1.i64(i64 %pointer_diff, i64 1) ret <9 x i1> %0 } @@ -420,7 +480,10 @@ define <7 x i1> @whilewr_16_widen(ptr %a, ptr %b) { ; CHECK-NEXT: umov w6, v0.b[6] ; CHECK-NEXT: ret entry: - %0 = call <7 x i1> @llvm.loop.dependence.war.mask.v7i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <7 x i1> @llvm.loop.dependence.war.mask.v7i1.i64(i64 %pointer_diff, i64 2) ret <7 x i1> %0 } @@ -435,7 +498,10 @@ define <3 x i1> @whilewr_32_widen(ptr %a, ptr %b) { ; CHECK-NEXT: umov w2, v0.h[2] ; CHECK-NEXT: ret entry: - %0 = call <3 x i1> @llvm.loop.dependence.war.mask.v3i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <3 x i1> @llvm.loop.dependence.war.mask.v3i1.i64(i64 %pointer_diff, 
i64 4) ret <3 x i1> %0 } @@ -454,7 +520,10 @@ define <16 x i1> @whilewr_badimm(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 3) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointer_diff, i64 3) ret <16 x i1> %0 } @@ -466,7 +535,10 @@ define <1 x i1> @whilewr_8_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1.i64(i64 %pointer_diff, i64 1) ret <1 x i1> %0 } @@ -476,7 +548,10 @@ define <1 x i1> @whilewr_16_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1.i64(i64 %pointer_diff, i64 2) ret <1 x i1> %0 } @@ -486,7 +561,10 @@ define <1 x i1> @whilewr_32_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1.i64(i64 %pointer_diff, i64 4) ret <1 x i1> %0 } @@ -496,7 +574,10 @@ define <1 x i1> @whilewr_64_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 8) + %a_int = 
ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1.i64(i64 %pointer_diff, i64 8) ret <1 x i1> %0 } @@ -506,7 +587,10 @@ define <1 x i1> @whilerw_8_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1.i64(i64 %pointer_diff, i64 1) ret <1 x i1> %0 } @@ -516,7 +600,10 @@ define <1 x i1> @whilerw_16_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1.i64(i64 %pointer_diff, i64 2) ret <1 x i1> %0 } @@ -526,7 +613,10 @@ define <1 x i1> @whilerw_32_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1.i64(i64 %pointer_diff, i64 4) ret <1 x i1> %0 } @@ -536,7 +626,10 @@ define <1 x i1> @whilerw_64_scalarize(ptr %a, ptr %b) { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret entry: - %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1.i64(i64 %pointer_diff, i64 8) ret <1 x i1> %0 } @@ -548,7 +641,10 @@ define <8 x i1> @whilewr_extract_v8i1(ptr %a, ptr 
%b) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i64(i64 %pointer_diff, i64 1) ret <8 x i1> %0 } @@ -561,7 +657,10 @@ define <4 x i1> @whilewr_extract_v4i1(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %pointer_diff, i64 1) ret <4 x i1> %0 } @@ -573,6 +672,9 @@ define <2 x i1> @whilewr_extract_v2i1(ptr %a, ptr %b) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret entry: - %0 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1.i64(i64 %pointer_diff, i64 4) ret <2 x i1> %0 } diff --git a/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll index 0b1221244a757..98a3e662e0c82 100644 --- a/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll +++ b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll @@ -6,6 +6,9 @@ define <16 x i1> @whilewr_8(ptr %a, ptr %b) { entry: - %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointer_diff, i64 1) ret <16 x i1> %0 } diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll 
b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll index e9463b5c571b6..b8b9a027addcf 100644 --- a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll +++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll @@ -7,7 +7,10 @@ define @whilewr_8(ptr %a, ptr %b) { ; CHECK-NEXT: whilewr p0.b, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointer_diff, i64 1) ret %0 } @@ -17,7 +20,10 @@ define @whilewr_16(ptr %a, ptr %b) { ; CHECK-NEXT: whilewr p0.h, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %pointer_diff, i64 2) ret %0 } @@ -27,7 +33,10 @@ define @whilewr_32(ptr %a, ptr %b) { ; CHECK-NEXT: whilewr p0.s, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %pointer_diff, i64 4) ret %0 } @@ -37,7 +46,10 @@ define @whilewr_64(ptr %a, ptr %b) { ; CHECK-NEXT: whilewr p0.d, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %pointer_diff, i64 8) ret %0 } @@ -47,7 +59,10 @@ define @whilerw_8(ptr %a, ptr %b) { ; CHECK-NEXT: whilerw p0.b, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = 
ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %pointer_diff, i64 1) ret %0 } @@ -57,7 +72,10 @@ define @whilerw_16(ptr %a, ptr %b) { ; CHECK-NEXT: whilerw p0.h, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.raw.mask.nxv8i1.i64(i64 %pointer_diff, i64 2) ret %0 } @@ -67,7 +85,10 @@ define @whilerw_32(ptr %a, ptr %b) { ; CHECK-NEXT: whilerw p0.s, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.raw.mask.nxv4i1.i64(i64 %pointer_diff, i64 4) ret %0 } @@ -77,7 +98,10 @@ define @whilerw_64(ptr %a, ptr %b) { ; CHECK-NEXT: whilerw p0.d, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.raw.mask.nxv2i1.i64(i64 %pointer_diff, i64 8) ret %0 } @@ -92,7 +116,10 @@ define @whilewr_8_split(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p1.b, x8, x9 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv32i1.i64(i64 %pointer_diff, i64 1) ret %0 } @@ -111,7 +138,10 @@ define @whilewr_8_split2(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p3.b, x8, x9 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv64i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + 
%pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv64i1.i64(i64 %pointer_diff, i64 1) ret %0 } @@ -126,7 +156,10 @@ define @whilewr_16_expand(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointer_diff, i64 2) ret %0 } @@ -143,7 +176,10 @@ define @whilewr_16_expand2(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p1.b, x8, x9 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv32i1.i64(i64 %pointer_diff, i64 2) ret %0 } @@ -159,7 +195,10 @@ define @whilewr_32_expand(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.h, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %pointer_diff, i64 4) ret %0 } @@ -175,7 +214,10 @@ define @whilewr_32_expand2(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointer_diff, i64 4) ret %0 } @@ -193,7 +235,10 @@ define @whilewr_32_expand3(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p1.b, x8, x9 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to 
i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv32i1.i64(i64 %pointer_diff, i64 4) ret %0 } @@ -209,7 +254,10 @@ define @whilewr_64_expand(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.s, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %pointer_diff, i64 8) ret %0 } @@ -225,7 +273,10 @@ define @whilewr_64_expand2(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.h, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %pointer_diff, i64 8) ret %0 } @@ -241,7 +292,10 @@ define @whilewr_64_expand3(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointer_diff, i64 8) ret %0 } @@ -259,7 +313,10 @@ define @whilewr_64_expand4(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p1.b, x8, x9 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 8) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv32i1.i64(i64 %pointer_diff, i64 8) ret %0 } @@ -269,7 +326,10 @@ define @whilewr_8_widen(ptr %a, ptr %b) { ; CHECK-NEXT: whilewr p0.b, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv9i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to 
i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv9i1.i64(i64 %pointer_diff, i64 1) ret %0 } @@ -279,7 +339,10 @@ define @whilewr_16_widen(ptr %a, ptr %b) { ; CHECK-NEXT: whilewr p0.h, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv7i1(ptr %a, ptr %b, i64 2) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv7i1.i64(i64 %pointer_diff, i64 2) ret %0 } @@ -289,7 +352,10 @@ define @whilewr_32_widen(ptr %a, ptr %b) { ; CHECK-NEXT: whilewr p0.s, x0, x1 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv3i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv3i1.i64(i64 %pointer_diff, i64 4) ret %0 } @@ -306,7 +372,10 @@ define @whilewr_badimm(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 3) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointer_diff, i64 3) ret %0 } @@ -317,7 +386,10 @@ define @whilewr_extract_nxv8i1(ptr %a, ptr %b) { ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv8i1.i64(i64 %pointer_diff, i64 1) ret %0 } @@ -329,7 +401,10 @@ define @whilewr_extract_nxv4i1(ptr %a, ptr %b) { ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + 
%pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv4i1.i64(i64 %pointer_diff, i64 1) ret %0 } @@ -341,6 +416,33 @@ define @whilewr_extract_nxv2i1(ptr %a, ptr %b) { ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 4) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv2i1.i64(i64 %pointer_diff, i64 4) ret %0 } + +define @whilewr_i32_diff(i32 %a, i32 %b) { +; CHECK-LABEL: whilewr_i32_diff: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w8, w1, w0 +; CHECK-NEXT: sxtw x8, w8 +; CHECK-NEXT: whilewr p0.b, xzr, x8 +; CHECK-NEXT: ret +entry: + %pointer_diff = sub i32 %b, %a + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i32(i32 %pointer_diff, i64 1) + ret %0 +} + +; Test passing a difference that is not the result of a sub. +define @whilewr_no_sub(i64 %diff) { +; CHECK-LABEL: whilewr_no_sub: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, xzr, x0 +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %diff, i64 1) + ret %0 +} diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll index 541e312757369..69134665a2900 100644 --- a/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll +++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll @@ -10,6 +10,9 @@ define @whilewr_8(ptr %a, ptr %b) { ; CHECK-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NEXT: ret entry: - %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) + %a_int = ptrtoaddr ptr %a to i64 + %b_int = ptrtoaddr ptr %b to i64 + %pointer_diff = sub i64 %b_int, %a_int + %0 = call @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointer_diff, i64 1) ret %0 }