135 changes: 73 additions & 62 deletions llvm/docs/LangRef.rst
@@ -25020,66 +25020,73 @@ This is an overloaded intrinsic.

::

declare <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %pointerDiff, i64 immarg %elementSize)
declare <8 x i1> @llvm.loop.dependence.war.mask.v8i1.i32(i32 %pointerDiff, i64 immarg %elementSize)
declare <16 x i1> @llvm.loop.dependence.war.mask.v16i1.i64(i64 %pointerDiff, i64 immarg %elementSize)
declare <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1.i64(i64 %pointerDiff, i64 immarg %elementSize)


Overview:
"""""""""

Given a vector load from %ptrA followed by a vector store to %ptrB, this
instruction generates a mask where an active lane indicates that the
write-after-read sequence can be performed safely for that lane, without the
danger of a write-after-read hazard occurring.
For a given pointer difference between a vector load and a vector store
(i.e., ``%storePointer - %loadPointer``), where the load occurs before the store,
this instruction generates a mask where active lanes indicate a write-after-read
sequence can be performed safely.

A write-after-read hazard occurs when a write-after-read sequence for a given
lane in a vector ends up being executed as a read-after-write sequence due to
the aliasing of pointers.
Lanes are inactive when a write-after-read hazard may occur, which happens when
the load of a lane depends on a value that has yet to be written. See below for
examples.

Arguments:
""""""""""

The first two arguments are pointers and the last argument is an immediate.
The result is a vector with the i1 element type.
The first argument is a signed pointer difference. The second argument is an
immediate describing the element size in bytes.

The result is a vector with an i1 element type.

Semantics:
""""""""""

``%elementSize`` is the size of the accessed elements in bytes.
The intrinsic returns ``poison`` if the distance between ``%ptrA`` and ``%ptrB``
is smaller than ``VF * %elementSize`` and either ``%ptrA + VF * %elementSize``
or ``%ptrB + VF * %elementSize`` wraps.
``%elementSize`` is the size of the accessed elements in bytes. It is assumed
both the load and store use the same vector type.

The intrinsic returns ``poison`` if ``%pointerDiff`` is not a multiple of
``%elementSize``.
Comment on lines +25055 to +25056
Collaborator:

It has been a while since I looked into these intrinsics but it sounds like this would remove the reason for them existing. The instructions on AArch64 do not follow the basic math that you would expect from simple integers, otherwise we would recognise them from icmp+activelanemask directly. As far as I remember the reason the intrinsics exist is to properly handle the fact that they do not wrap as expected from i64 math.

(Also I think it should be possible to fold a loop.mask(A,B) with noalias A, B to all-true, which could be useful in the current pass pipeline but maybe isn't something that anyone would ever implement).

Collaborator:

It comes from #114028 (review)

Member Author (@MacDue, Mar 27, 2026):

Can we specify this in a way we don't need to be overly cautious about the extreme cases? I don't think they introduce a correctness issue (in that a lane is active where a dependence exists).

I think for there to be a correctness issue:

  • The difference between the pointers would have to be >= 1 and < VL (call this trueDist)
  • Computing that difference in i64s would have to result in a value < 0 or > trueDist.

But as 1 to VL - 1 should fit within an i64, I don't think any correctness issue can occur.

Note: If this is not possible it also makes this intrinsic hard to expand without whilewr/whilerw (and I think the current expansions would be incorrect).

This comment was marked as outdated.

Member Author:

So, maybe if we specify %pointerDiff as implicitly frozen (to avoid freeze(sub nsw) -> sub), and always emit the subs with nsw within the loop vectorizer. We can fold to whilewr/rw and still give a safe result in case the difference does wrap. If we can't fold the sub into the mask, we can always expand to a whilelo (but I don't think that should occur to masks emitted by the loop vectorizer). Any thoughts?

Contributor:

> So, maybe if we specify %pointerDiff as implicitly frozen (to avoid freeze(sub nsw) -> sub), and always emit the subs with nsw within the loop vectorizer.

So when it wraps and is poison, we want to always generate an all-true mask? I think if we implicitly freeze, we could end up with a pointerDiff value that makes a mask with some trailing lanes being false, which would still be correct if I understand the above correctly.

Would the new variant keep the bit about pointerDiff needing to be a multiple of element size? If so, then the implicitly frozen value may not be a multiple and we would have a poison mask. If that's an issue, maybe the intrinsic could return an all-true mask for poison pointerDiff?

Member Author (@MacDue, Mar 27, 2026):

> some trailing lanes being false,

My idea was to do something "safe" but not necessarily optimal for the extreme case.

I think I'm probably overthinking this though and it'd be simpler to do @llvm.loop.dependence.war.mask.v4i1.i64(i64 %a, i64 %b, i64 immarg %elementSize), and more precisely specify the semantics. We'd still have the CSE issues, but not having to deal with pointer types is still a little nicer.

I think the semantics should be (and I think this could be done in plain IR):

.war mask: true if any of:

  • icmp ule i64 %b, %a
  • icmp ult (%elementSize * lane), (%b - %a)

.raw mask: true if any of:

  • icmp eq i64 %a, %b
  • if icmp ugt i64 %b, %a:
    • then: icmp ult (%elementSize * lane), (%b - %a)
    • otherwise: icmp ult (%elementSize * lane), (%a - %b)

Not sure if the current expansions exactly match this or not.
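For reference, the proposal above can be modeled in plain Python, emulating i64 unsigned comparisons and wraparound. This is only an illustration of the proposed semantics, not the current expansions; `vf` stands for the number of vector lanes.

```python
M = 1 << 64  # emulate i64 unsigned arithmetic


def proposed_war_mask(a: int, b: int, element_size: int, vf: int) -> list[bool]:
    # Lane is active if (icmp ule i64 %b, %a), or
    # (element_size * lane) u< (%b - %a).
    diff = (b - a) % M
    return [b % M <= a % M or element_size * lane < diff
            for lane in range(vf)]


def proposed_raw_mask(a: int, b: int, element_size: int, vf: int) -> list[bool]:
    # Lane is active if %a == %b; otherwise compare against the unsigned
    # distance in whichever direction is positive.
    dist = (b - a) % M if b % M > a % M else (a - b) % M
    return [a % M == b % M or element_size * lane < dist
            for lane in range(vf)]


# Store 8 bytes ahead of the load (element size 4): two safe lanes.
print(proposed_war_mask(0, 8, 4, 4))  # [True, True, False, False]
```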

Contributor:

> I think I'm probably overthinking this though and it'd be simpler to do @llvm.loop.dependence.war.mask.v4i1.i64(i64 %a, i64 %b, i64 immarg %elementSize), and more precisely specify the semantics. We'd still have the CSE issues, but not having to deal with pointer types is still a little nicer.

Yeah, that also sounds reasonable, with the main benefit being to avoid the conversion back to pointers.

With that we could also simplify the wording quite a bit, I think: not mention pointers at all, since it is defined clearly in terms of IR operations.

Collaborator:

If we can just use unsigned conditions and they are equivalent to the instructions that would be good. That would help make them simpler.

What makes ptr arguments difficult to deal with?

Member Author:

> What makes ptr arguments difficult to deal with?

We're using these instructions as an alternative way to handle diff checks. The SCEV expressions used for diff checks have already been converted to integers at the places we want to emit dependence masks. If we don't rework things to preserve the exact pointer types, then naively casting to a pointer type will drop information such as the address space. I don't think this is a problem for correctness, but the IR does look broken. Really, these instructions don't care about any details of pointers other than the address, so I think it makes sense to drop that information.


The element of the result mask is active when loading from %ptrA then storing to
%ptrB is safe and doesn't result in a write-after-read hazard, meaning that:
For each lane of the mask, a lane is active if any of the following hold:

* (ptrB - ptrA) <= 0 (guarantees that all lanes are loaded before any stores), or
* elementSize * lane < (ptrB - ptrA) (guarantees that this lane is loaded
before the store to the same address)
* ``%pointerDiff <= 0`` (this is a signed comparison)

  - All lanes can be loaded without depending on any values yet to be written

* ``(%elementSize * lane) < %pointerDiff``

  - This lane can be loaded without depending on a value yet to be written
  - ``lane`` is in the range ``[0, VF)``
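The lane-activity rules above can be modeled in plain Python. This is a sketch of the documented semantics only, not LLVM code; ``vf`` stands for the number of vector lanes.

```python
def war_mask(pointer_diff: int, element_size: int, vf: int) -> list[bool]:
    """Model of llvm.loop.dependence.war.mask, assuming pointer_diff is a
    multiple of element_size (otherwise the result would be poison)."""
    active = []
    for lane in range(vf):
        # pointer_diff <= 0: all lanes are loaded before any store aliases them.
        # element_size * lane < pointer_diff: this lane is loaded before the
        # store to the same address.
        active.append(pointer_diff <= 0 or element_size * lane < pointer_diff)
    return active


# Store pointer is two elements ahead of the load pointer:
# only the first two lanes are safe.
print(war_mask(2 * 4, 4, 4))  # [True, True, False, False]
```

This matches the worked cases in the Examples section: a non-positive difference yields an all-true mask, and a difference of ``2 * elementSize`` leaves two active lanes.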

Examples:
"""""""""

.. code-block:: llvm

%loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 4)
%vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask, <4 x i32> poison)
%aAddr = ptrtoaddr ptr %ptrA to i64
%bAddr = ptrtoaddr ptr %ptrB to i64
%pointerDiff = sub i64 %bAddr, %aAddr
%loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1.i64(i64 %pointerDiff, i64 4)
%vecA = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask, <4 x i32> poison)
[...]
call @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask)
call @llvm.masked.store.v4i32.p0(<4 x i32> %vecA, ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask)

; For the above example, consider the following cases:
;
; 1. ptrA >= ptrB
; 1. ptrA >= ptrB (pointerDiff <= 0)
;
; load = <0,1,2,3> ; uint32_t load = array[i+2];
; store = <0,1,2,3> ; array[i] = store;
;
; This results in an all-true mask, as the load always occurs before the
; store, so it does not depend on any values to be stored.
;
; 2. ptrB - ptrA = 2 * elementSize:
; 2. pointerDiff = 2 * elementSize:
;
; load = <0,1,2,3> ; uint32_t load = array[i];
; store = <0,1,2,3> ; array[i+2] = store;
@@ -25088,7 +25095,7 @@ Examples:
; we can only read two lanes before we would read values that have yet to
; be written.
;
; 3. ptrB - ptrA = 4 * elementSize
; 3. pointerDiff = 4 * elementSize
;
; load = <0,1,2,3> ; uint32_t load = array[i];
; store = <0,1,2,3> ; array[i+4] = store;
Expand All @@ -25107,79 +25114,83 @@ This is an overloaded intrinsic.

::

declare <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <vscale x 16 x i1> @llvm.loop.dependence.raw.mask.nxv16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize)
declare <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %pointerDiff, i64 immarg %elementSize)
declare <8 x i1> @llvm.loop.dependence.raw.mask.v8i1.i32(i32 %pointerDiff, i64 immarg %elementSize)
declare <16 x i1> @llvm.loop.dependence.raw.mask.v16i1.i64(i64 %pointerDiff, i64 immarg %elementSize)
declare <vscale x 16 x i1> @llvm.loop.dependence.raw.mask.nxv16i1.i64(i64 %pointerDiff, i64 immarg %elementSize)


Overview:
"""""""""

Given a vector store to %ptrA followed by a vector load from %ptrB, this
instruction generates a mask where an active lane indicates that the
read-after-write sequence can be performed safely for that lane, without a
read-after-write hazard or a store-to-load forwarding hazard being introduced.

A read-after-write hazard occurs when a read-after-write sequence for a given
lane in a vector ends up being executed as a write-after-read sequence due to
the aliasing of pointers.
For a given pointer difference between a vector store and a vector load
(i.e., ``%loadPointer - %storePointer``), where the store occurs before the load,
this instruction generates a mask where active lanes indicate a read-after-write
sequence can be performed safely.

A store-to-load forwarding hazard occurs when a vector store writes to an
address that partially overlaps with the address of a subsequent vector load,
meaning that the vector load can't be performed until the vector store is
complete.
Lanes are inactive where there is a danger of a read-after-write or
store-to-load forwarding hazard occurring. A read-after-write hazard occurs
when a lane is overwritten before it can be read. A store-to-load forwarding
hazard occurs when a vector load partially depends on an immediately preceding
store. See below for examples.

Arguments:
""""""""""

The first two arguments are pointers and the last argument is an immediate.
The result is a vector with the i1 element type.
The first argument is a signed pointer difference. The second argument is an
immediate describing the element size in bytes.

The result is a vector with an i1 element type.

Semantics:
""""""""""

``%elementSize`` is the size of the accessed elements in bytes.
The intrinsic returns ``poison`` if the distance between ``%ptrA`` and ``%ptrB``
is smaller than ``VF * %elementSize`` and either ``%ptrA + VF * %elementSize``
or ``%ptrB + VF * %elementSize`` wraps.
``%elementSize`` is the size of the accessed elements in bytes. It is assumed
both the load and store use the same vector type.

The intrinsic returns ``poison`` if ``%pointerDiff`` is not a multiple of
``%elementSize``.

The element of the result mask is active when storing to %ptrA then loading from
%ptrB is safe and doesn't result in aliasing, meaning that:
For each lane of the mask, a lane is active if any of the following hold:

* elementSize * lane < abs(ptrB - ptrA) (guarantees that the store of this lane
occurs before loading from this address), or
* ptrA == ptrB (doesn't introduce any new hazards that weren't in the scalar
code)
* ``%pointerDiff == 0``

  - No read-after-write hazard, and the store can likely be forwarded to the load

* ``%elementSize * lane < abs(%pointerDiff)``

  - A write of this lane does not overlap with the succeeding load (``%pointerDiff > 0``)
  - A read of this lane does not depend on the preceding store (``%pointerDiff < 0``)
  - ``lane`` is in the range ``[0, VF)``
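As with the WAR variant, these rules can be modeled in plain Python. This is a sketch of the documented semantics only, not LLVM code; ``vf`` stands for the number of vector lanes.

```python
def raw_mask(pointer_diff: int, element_size: int, vf: int) -> list[bool]:
    """Model of llvm.loop.dependence.raw.mask, assuming pointer_diff is a
    multiple of element_size (otherwise the result would be poison)."""
    return [
        # pointer_diff == 0: store and load fully overlap, no new hazard.
        # element_size * lane < abs(pointer_diff): this lane's store and load
        # do not conflict, regardless of which pointer is ahead.
        pointer_diff == 0 or element_size * lane < abs(pointer_diff)
        for lane in range(vf)
    ]


# Load pointer is two elements behind the store pointer:
# only the first two lanes are safe.
print(raw_mask(-2 * 4, 4, 4))  # [True, True, False, False]
```

Note that, unlike the WAR mask, the comparison uses the absolute difference, so a positive and a negative difference of the same magnitude produce the same mask.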

Examples:
"""""""""

.. code-block:: llvm

%loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 4)
call @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask)
%aAddr = ptrtoaddr ptr %ptrA to i64
%bAddr = ptrtoaddr ptr %ptrB to i64
%pointerDiff = sub i64 %bAddr, %aAddr
%loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1.i64(i64 %pointerDiff, i64 4)
call @llvm.masked.store.v4i32.p0(<4 x i32> %vecA, ptr align 4 %ptrA, <4 x i1> %loop.dependence.mask)
[...]
%vecB = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask, <4 x i32> poison)
%vecB = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %ptrB, <4 x i1> %loop.dependence.mask, <4 x i32> poison)

; For the above example, consider the following cases:
;
; 1. ptrA == ptrB
; 1. ptrA == ptrB (pointerDiff == 0)
;
; store = <0,1,2,3> ; array[i] = store;
; load = <0,1,2,3> ; uint32_t load = array[i];
;
; This results in an all-true mask. There is no conflict.
;
; 2. ptrB - ptrA = 2 * elementSize
; 2. pointerDiff = 2 * elementSize
;
; store = <0,1,2,3> ; array[i] = store;
; load = <0,1,2,3> ; uint32_t load = array[i+2];
;
; This results in a mask with the first two lanes active. In this case,
; only two lanes can be written without overwriting values yet to be read.
;
; 3. ptrB - ptrA = -2 * elementSize
; 3. pointerDiff = -2 * elementSize
;
; store = <0,1,2,3> ; array[i+2] = store;
; load = <0,1,2,3> ; uint32_t load = array[i];
36 changes: 14 additions & 22 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2190,46 +2190,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// The possible expansions are...
//
// loop_dependence_war_mask:
// diff = (ptrB - ptrA) / eltSize
// cmp = icmp sle diff, 0
// cmp = icmp sle (diff / eltSize), 0
// upper_bound = select cmp, -1, diff
// mask = get_active_lane_mask 0, upper_bound
//
// loop_dependence_raw_mask:
// diff = (abs(ptrB - ptrA)) / eltSize
// cmp = icmp eq diff, 0
// cmp = icmp eq (diff / eltSize), 0
// upper_bound = select cmp, -1, diff
// mask = get_active_lane_mask 0, upper_bound
//
auto *PtrTy = cast<PointerType>(ICA.getArgTypes()[0]);
Type *IntPtrTy = IntegerType::getIntNTy(
RetTy->getContext(), thisT()->getDataLayout().getPointerSizeInBits(
PtrTy->getAddressSpace()));
Type *IntTy = ICA.getArgTypes()[0];
bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;

InstructionCost Cost =
thisT()->getArithmeticInstrCost(Instruction::Sub, IntPtrTy, CostKind);
TTI::OperandValueInfo EltSizeOpInfo =
TTI::getOperandInfo(ICA.getArgs()[1]);
InstructionCost Cost = thisT()->getArithmeticInstrCost(
Instruction::SDiv, IntTy, CostKind, {}, EltSizeOpInfo);

if (IsReadAfterWrite) {
IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, IntPtrTy, {IntPtrTy},
{});
IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, IntTy, {IntTy}, {});
Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
}

TTI::OperandValueInfo EltSizeOpInfo =
TTI::getOperandInfo(ICA.getArgs()[2]);
Cost += thisT()->getArithmeticInstrCost(Instruction::SDiv, IntPtrTy,
CostKind, {}, EltSizeOpInfo);

Type *CondTy = IntegerType::getInt1Ty(RetTy->getContext());
CmpInst::Predicate Pred =
IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE;
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CondTy,
IntPtrTy, Pred, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, IntPtrTy,
CondTy, Pred, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CondTy, IntTy,
Pred, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, IntTy, CondTy,
Pred, CostKind);

IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
{IntPtrTy, IntPtrTy}, FMF);
{IntTy, IntTy}, FMF);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
return Cost;
}
8 changes: 4 additions & 4 deletions llvm/include/llvm/IR/Intrinsics.td
@@ -2511,13 +2511,13 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>] in {

def int_loop_dependence_raw_mask:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_ptr_ty, llvm_ptr_ty, llvm_i64_ty],
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<2>>]>;
[llvm_anyint_ty, llvm_i64_ty],
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

def int_loop_dependence_war_mask:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_ptr_ty, llvm_ptr_ty, llvm_i64_ty],
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<2>>]>;
[llvm_anyint_ty, llvm_i64_ty],
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

def int_get_active_lane_mask:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
20 changes: 20 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -535,6 +535,7 @@ namespace {
SDValue visitBRCOND(SDNode *N);
SDValue visitBR_CC(SDNode *N);
SDValue visitLOAD(SDNode *N);
SDValue visitLOOP_DEPENDENCE_MASK(SDNode *N);

SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
@@ -2095,6 +2096,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM: return visitVECREDUCE(N);
case ISD::LOOP_DEPENDENCE_RAW_MASK:
case ISD::LOOP_DEPENDENCE_WAR_MASK:
return visitLOOP_DEPENDENCE_MASK(N);
#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
#include "llvm/IR/VPIntrinsics.def"
return visitVPOp(N);
@@ -21132,6 +21136,22 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
return SDValue();
}

// Fold LOOP_DEPENDENCE_MASK(0, sub(B, A)) to LOOP_DEPENDENCE_MASK(A, B).
SDValue DAGCombiner::visitLOOP_DEPENDENCE_MASK(SDNode *N) {
auto *Op0Const = dyn_cast<ConstantSDNode>(N->getOperand(0));
if (!Op0Const || !Op0Const->isZero())
return SDValue();

SDValue Op1 = N->getOperand(1);
if (Op1.getOpcode() != ISD::SUB)
return SDValue();

SDValue Op10 = Op1->getOperand(0);
SDValue Op11 = Op1->getOperand(1);
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op11, Op10,
N->getOperand(2), N->getOperand(3));
}

namespace {

/// Helper structure used to slice a load in smaller loads.
19 changes: 8 additions & 11 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8511,18 +8511,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::loop_dependence_war_mask:
setValue(&I,
DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl,
EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
getValue(I.getOperand(1)), getValue(I.getOperand(2)),
DAG.getConstant(0, sdl, MVT::i64)));
return;
case Intrinsic::loop_dependence_raw_mask:
setValue(&I,
DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl,
EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
getValue(I.getOperand(1)), getValue(I.getOperand(2)),
DAG.getConstant(0, sdl, MVT::i64)));
unsigned Opcode = Intrinsic == Intrinsic::loop_dependence_war_mask
? ISD::LOOP_DEPENDENCE_WAR_MASK
: ISD::LOOP_DEPENDENCE_RAW_MASK;
SDValue PointerDiff = getValue(I.getOperand(0));
SDValue Zero = DAG.getConstant(0, sdl, PointerDiff.getValueType());
setValue(&I, DAG.getNode(Opcode, sdl, EVT::getEVT(I.getType()), Zero,
PointerDiff, getValue(I.getOperand(1)),
DAG.getConstant(0, sdl, MVT::i64)));
return;
}
}
13 changes: 13 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5578,6 +5578,19 @@ AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,

SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Op0 = Op.getOperand(0);

// Handle operands less than 64-bit (the diff must be sign extended).
if (Op0.getValueType() != MVT::i64) {
assert(Op0.getValueSizeInBits() < 64);
SDValue Op1 = Op.getOperand(1);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
SDValue Diff = DAG.getNode(ISD::SUB, DL, Op0.getValueType(), Op1, Op0);
SDValue DiffExt = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Diff);
return DAG.getNode(Op.getOpcode(), DL, VT,
{Zero, DiffExt, Op.getOperand(2), Op.getOperand(3)});
}

unsigned LaneOffset = Op.getConstantOperandVal(3);
unsigned NumElements = VT.getVectorMinNumElements();
uint64_t EltSizeInBytes = Op.getConstantOperandVal(2);