From 21688bd93f56aa6babd2835ef306093ad5557eec Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 8 Aug 2025 11:06:12 +0000 Subject: [PATCH 1/7] [IR] Add llvm.masked.load.first.fault intrinsic In order to support loading from addresses which may not be valid at runtime without generating faults, we introduce the first fault load intrinsic. Loading with this intrinsic will only generate a fault for invalid accesses on the first element of the vector. Any subsequent fault will be suppressed and the corresponding data will be poison. This PR contains target-independent scalarization of the intrinsic so that generic codegen works. --- llvm/docs/LangRef.rst | 61 ++++++++++ llvm/include/llvm/IR/Intrinsics.td | 8 ++ .../Scalar/ScalarizeMaskedMemIntrin.cpp | 96 +++++++++++++++- .../AArch64/masked-load-first-faulting.ll | 108 ++++++++++++++++++ .../expand-masked-load-first-fault.ll | 82 +++++++++++++ 5 files changed, 352 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll create mode 100644 llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 45a22f416dce1..f5311c7c447c9 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -27809,6 +27809,67 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i Other targets may support this intrinsic differently, for example, by lowering it into a sequence of branches that guard scalar store operations. +.. _int_mloadff: + +'``llvm.masked.load.first.fault.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. The loaded data is a vector of any integer, +floating-point or pointer data type. 
+ +:: + + declare { <16 x float>, <16 x i1> } @llvm.masked.load.first.fault.v16f32.p0(ptr <ptr>, i32 <alignment>, <16 x i1> <mask>) + declare { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault.v2f64.p0(ptr <ptr>, i32 <alignment>, <2 x i1> <mask>) + ;; The data is a vector of pointers + declare { <8 x ptr>, <8 x i1> } @llvm.masked.load.first.fault.v8p0.p0(ptr <ptr>, i32 <alignment>, <8 x i1> <mask>) + +Overview: +""""""""" + +Reads a vector from memory according to the provided mask, suppressing faults +for any lane beyond the first. The mask holds a bit for each vector lane, and +is used to prevent memory accesses to the masked-off lanes. + +Returns the loaded data and a mask indicating which lanes are valid, which may +not be the same as the input mask depending on whether the processor encountered +a reason to avoid loading from that address. Invalid lanes contain poison +values. + +Arguments: +"""""""""" + +The first argument is the base pointer for the load. The second argument is the +alignment of the source location. It must be a power of two constant integer +value. The third argument, mask, is a vector of boolean values with the same +number of elements as the return type. + +Semantics: +"""""""""" + +The '``llvm.masked.load.first.fault``' intrinsic is similar to the +'``llvm.masked.load``' intrinsic, in that it conditionally loads values from +memory into a vector based on a mask. However, it allows loading from addresses +which may not be entirely safe. If the memory corresponding to the first element +of the vector is inaccessible, then a fault will be raised as normal. For all +subsequent lanes faults will be suppressed and the corresponding bit in the +output mask will be marked inactive. The remaining elements in the output mask +after a suppressed fault will also be marked inactive. All elements in the data +result (first vector in the returned struct) with a corresponding element in the +mask result (second vector in the returned struct) set to inactive contain +poison values. 
+ +Reasons for marking output elements inactive are processor dependent; it may be +a genuine fault, e.g. if the range of the data being loaded spans a page +boundary and the page at the higher address is not mapped. But a given +processor may also mark elements as inactive for other reasons, such as a cache +miss. Code using this intrinsic must take this into account and not assume that +inactive lanes signal the end of accessible memory. If more data should be +loaded based on the semantics of the user code, then the base pointer should be +advanced to the address of the first inactive element and a new first fault load +attempted. Memory Use Markers ------------------ diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4469ff155b854..9fc7e0df9a1cd 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2598,6 +2598,14 @@ def int_masked_compressstore: [IntrWriteMem, IntrArgMemOnly, NoCapture>]>; +def int_masked_load_first_fault: + DefaultAttrsIntrinsic<[llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyptr_ty, llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrReadMem, IntrArgMemOnly, ImmArg>, + NoCapture>]>; + def int_experimental_vector_compress: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index da9ceb4f440e5..c1f0a81183f8c 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -1040,6 +1040,85 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, ModifiedDT = true; } +static void scalarizeMaskedFirstFaultingLoad(const DataLayout &DL, CallInst *CI, + DomTreeUpdater *DTU, + bool &ModifiedDT) { + // For a target without first-faulting 
load support, we can't actually + // scalarize accesses for all lanes. However, lanes beyond the first may be + // considered inactive due to reasons beyond a fault, so for generic + // 'scalarization' we can just load the first lane (if the corresponding + // input mask bit is active), then mark all other lanes as inactive in the + // output mask and embed the first lane into a vector of poison. + Value *Ptr = CI->getArgOperand(0); + Value *Align = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + StructType *RetTy = cast(CI->getType()); + VectorType *DataTy = cast(RetTy->getElementType(0)); + VectorType *MaskTy = cast(RetTy->getElementType(1)); + Type *ScalarTy = DataTy->getScalarType(); + + MaybeAlign AlignVal = cast(Align)->getMaybeAlignValue(); + + IRBuilder<> Builder(CI->getContext()); + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + Value *EmptyMask = Constant::getNullValue(MaskTy); + Value *PoisonData = PoisonValue::get(DataTy); + + // First create a check to determine whether the first lane is active + // + // %first.active = extractelement %mask, i64 0 + // br i1 %first.active, label %load.ff.first.lane, label %load.ff.result + Value *FirstActive = + Builder.CreateExtractElement(Mask, 0ul, Twine("first.active")); + Instruction *ThenTerm = + SplitBlockAndInsertIfThen(FirstActive, CI, + /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DTU); + + // If the first mask lane was active, then we want a real load of one element + // into the first element of a vector, with the rest being poison. 
+ // + // load.ff.first.lane: + // %ld.first = load ty, ptr %Ptr + // %lane = insertelement poison, ty %ld.first, i64 0 + // br label %load.ff.result + BasicBlock *ThenBlock = ThenTerm->getParent(); + ThenBlock->setName("load.ff.first.lane"); + Builder.SetInsertPoint(ThenBlock->getTerminator()); + LoadInst *Load = Builder.CreateAlignedLoad(ScalarTy, Ptr, AlignVal); + Value *OneLaneData = Builder.CreateInsertElement(PoisonData, Load, 0ul); + Value *OneLaneMask = Builder.CreateInsertElement( + EmptyMask, Constant::getAllOnesValue(MaskTy->getElementType()), 0ul); + + // Now we just select between the two based on the check of the first lane + // + // load.ff.result: + // %data.res = phi [ poison, %orig ], [ %lane, %load.ff.first.lane ] + // %mask.res = phi [ false, %orig ], [ , %ld.ff... ] + // %ins = insertvalue { , } poison, %data.res, 0 + // %first.lane.only = insertvalue { , } %ins, ...,1 + // ... replace all intrinsic uses with %first.lane.only + Builder.SetInsertPoint(CI); + PHINode *ResData = Builder.CreatePHI(DataTy, 2); + ResData->addIncoming(PoisonData, IfBlock); + ResData->addIncoming(OneLaneData, ThenBlock); + PHINode *ResMask = Builder.CreatePHI(MaskTy, 2); + ResMask->addIncoming(EmptyMask, IfBlock); + ResMask->addIncoming(OneLaneMask, ThenBlock); + + Value *Result = PoisonValue::get(RetTy); + Result = Builder.CreateInsertValue(Result, ResData, 0ul); + Result = Builder.CreateInsertValue(Result, ResMask, 1ul); + if (CI->hasName()) + Result->setName(CI->getName() + ".first.lane.only"); + CI->getParent()->setName("load.ff.result"); + CI->replaceAllUsesWith(Result); + CI->eraseFromParent(); + ModifiedDT = true; +} + static bool runImpl(Function &F, const TargetTransformInfo &TTI, DominatorTree *DT) { std::optional DTU; @@ -1110,11 +1189,18 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, DomTreeUpdater *DTU) { IntrinsicInst *II = dyn_cast(CI); if (II) { - // The scalarization code below does not work for scalable vectors. 
+ // The scalarization code below does not work for scalable vectors, except + // for first faulting loads, which only need to deal with the first element. if (isa(II->getType()) || - any_of(II->args(), - [](Value *V) { return isa(V->getType()); })) + any_of(II->args(), [](Value *V) { + return isa(V->getType()); + })) { + if (II->getIntrinsicID() == Intrinsic::masked_load_first_fault) { + scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT); + return true; + } return false; + } switch (II->getIntrinsicID()) { default: break; @@ -1185,6 +1271,10 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, scalarizeMaskedCompressStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; + case Intrinsic::masked_load_first_fault: { + scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT); + return true; + } } } diff --git a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll new file mode 100644 index 0000000000000..3d2f8c705ba40 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O3 -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefix=NEON +; RUN: llc -O3 -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=SVE +; RUN: llc -O3 -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefix=SME_STREAMING + +define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { +; NEON-LABEL: load_ff_v4i32: +; NEON: // %bb.0: +; NEON-NEXT: // kill: def $d0 killed $d0 def $q0 +; NEON-NEXT: umov w8, v0.h[0] +; NEON-NEXT: tbz w8, #0, .LBB0_2 +; NEON-NEXT: // %bb.1: // %load.ff.first.lane +; NEON-NEXT: adrp x8, .LCPI0_0 +; NEON-NEXT: ldr s0, [x0] +; NEON-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; NEON-NEXT: // kill: def $d1 killed $d1 killed $q1 +; NEON-NEXT: ret +; NEON-NEXT: .LBB0_2: +; 
NEON-NEXT: movi v1.2d, #0000000000000000 +; NEON-NEXT: // implicit-def: $q0 +; NEON-NEXT: // kill: def $d1 killed $d1 killed $q1 +; NEON-NEXT: ret +; +; SVE-LABEL: load_ff_v4i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; SVE-NEXT: umov w8, v0.h[0] +; SVE-NEXT: tbz w8, #0, .LBB0_2 +; SVE-NEXT: // %bb.1: // %load.ff.first.lane +; SVE-NEXT: adrp x8, .LCPI0_0 +; SVE-NEXT: ldr s0, [x0] +; SVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; SVE-NEXT: // kill: def $d1 killed $d1 killed $q1 +; SVE-NEXT: ret +; SVE-NEXT: .LBB0_2: +; SVE-NEXT: movi v1.2d, #0000000000000000 +; SVE-NEXT: // implicit-def: $q0 +; SVE-NEXT: // kill: def $d1 killed $d1 killed $q1 +; SVE-NEXT: ret +; +; SME_STREAMING-LABEL: load_ff_v4i32: +; SME_STREAMING: // %bb.0: +; SME_STREAMING-NEXT: fmov w8, s0 +; SME_STREAMING-NEXT: tbz w8, #0, .LBB0_2 +; SME_STREAMING-NEXT: // %bb.1: // %load.ff.first.lane +; SME_STREAMING-NEXT: ptrue p0.s +; SME_STREAMING-NEXT: adrp x8, .LCPI0_1 +; SME_STREAMING-NEXT: ldr d1, [x8, :lo12:.LCPI0_1] +; SME_STREAMING-NEXT: ld1rw { z0.s }, p0/z, [x0] +; SME_STREAMING-NEXT: ret +; SME_STREAMING-NEXT: .LBB0_2: +; SME_STREAMING-NEXT: mov z1.h, #0 // =0x0 +; SME_STREAMING-NEXT: adrp x8, .LCPI0_0 +; SME_STREAMING-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] +; SME_STREAMING-NEXT: ret + %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <4 x i1> %mask) + ret { <4 x i32>, <4 x i1> } %res +} + +define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_fully_aligned(ptr %p) { +; NEON-LABEL: load_ff_v2f64_all_true_fully_aligned: +; NEON: // %bb.0: // %load.ff.first.lane +; NEON-NEXT: adrp x8, .LCPI1_0 +; NEON-NEXT: ldr d0, [x0] +; NEON-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; NEON-NEXT: ret +; +; SVE-LABEL: load_ff_v2f64_all_true_fully_aligned: +; SVE: // %bb.0: // %load.ff.first.lane +; SVE-NEXT: ldr d0, [x0] +; SVE-NEXT: index z1.s, #1, #-1 +; SVE-NEXT: // kill: def $d1 killed $d1 killed $z1 +; SVE-NEXT: ret +; +; SME_STREAMING-LABEL: 
load_ff_v2f64_all_true_fully_aligned: +; SME_STREAMING: // %bb.0: // %load.ff.first.lane +; SME_STREAMING-NEXT: ptrue p0.d +; SME_STREAMING-NEXT: index z1.s, #1, #-1 +; SME_STREAMING-NEXT: ld1rd { z0.d }, p0/z, [x0] +; SME_STREAMING-NEXT: ret + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <2 x i1> ) + ret { <2 x double>, <2 x i1> } %res +} + +define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_partially_aligned(ptr %p) { +; NEON-LABEL: load_ff_v2f64_all_true_partially_aligned: +; NEON: // %bb.0: // %load.ff.first.lane +; NEON-NEXT: adrp x8, .LCPI2_0 +; NEON-NEXT: ldr d0, [x0] +; NEON-NEXT: ldr d1, [x8, :lo12:.LCPI2_0] +; NEON-NEXT: ret +; +; SVE-LABEL: load_ff_v2f64_all_true_partially_aligned: +; SVE: // %bb.0: // %load.ff.first.lane +; SVE-NEXT: ldr d0, [x0] +; SVE-NEXT: index z1.s, #1, #-1 +; SVE-NEXT: // kill: def $d1 killed $d1 killed $z1 +; SVE-NEXT: ret +; +; SME_STREAMING-LABEL: load_ff_v2f64_all_true_partially_aligned: +; SME_STREAMING: // %bb.0: // %load.ff.first.lane +; SME_STREAMING-NEXT: ptrue p0.d +; SME_STREAMING-NEXT: index z1.s, #1, #-1 +; SME_STREAMING-NEXT: ld1rd { z0.d }, p0/z, [x0] +; SME_STREAMING-NEXT: ret + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 8, <2 x i1> ) + ret { <2 x double>, <2 x i1> } %res +} diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll new file mode 100644 index 0000000000000..ed78888e40691 --- /dev/null +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p scalarize-masked-mem-intrin -S < %s | FileCheck %s + +define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { +; CHECK-LABEL: define { <4 x i32>, <4 x i1> } @load_ff_v4i32( +; CHECK-SAME: ptr 
[[P:%.*]], <4 x i1> [[MASK:%.*]]) { +; CHECK-NEXT: [[FIRST_ACTIVE:%.*]] = extractelement <4 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[FIRST_ACTIVE]], label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]] +; CHECK: [[LOAD_FF_FIRST_LANE]]: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0 +; CHECK-NEXT: br label %[[LOAD_FF_RESULT]] +; CHECK: [[LOAD_FF_RESULT]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i1> [ zeroinitializer, [[TMP0]] ], [ , %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[TMP3]], 0 +; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[TMP5]], <4 x i1> [[TMP4]], 1 +; CHECK-NEXT: ret { <4 x i32>, <4 x i1> } [[RES_FIRST_LANE_ONLY]] +; + %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <4 x i1> %mask) + ret { <4 x i32>, <4 x i1> } %res +} + +;; We can 'scalarize' first faulting loads for scalable vectors, since we only +;; need to insert a single element into the start of a poison splat vector. 
+define { , } @load_ff_nxv4i32(ptr %p, %mask) { +; CHECK-LABEL: define { , } @load_ff_nxv4i32( +; CHECK-SAME: ptr [[P:%.*]], [[MASK:%.*]]) { +; CHECK-NEXT: [[FIRST_ACTIVE:%.*]] = extractelement [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[FIRST_ACTIVE]], label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]] +; CHECK: [[LOAD_FF_FIRST_LANE]]: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; CHECK-NEXT: br label %[[LOAD_FF_RESULT]] +; CHECK: [[LOAD_FF_RESULT]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, [[TMP0]] ], [ insertelement ( zeroinitializer, i1 true, i64 0), %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +; CHECK-NEXT: [[RES:%.*]] = insertvalue { , } [[TMP5]], [[TMP4]], 1 +; CHECK-NEXT: ret { , } [[RES]] +; + %res = call { , } @llvm.masked.load.first.fault(ptr %p, i32 16, %mask) + ret { , } %res +} + +define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true(ptr %p) { +; CHECK-LABEL: define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: br i1 true, label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]] +; CHECK: [[LOAD_FF_FIRST_LANE]]: +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[P]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0 +; CHECK-NEXT: br label %[[LOAD_FF_RESULT]] +; CHECK: [[LOAD_FF_RESULT]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x double> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i1> [ zeroinitializer, [[TMP0]] ], [ , %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <2 x double>, <2 x i1> } poison, <2 x double> [[TMP3]], 0 +; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <2 x double>, <2 x i1> } [[TMP5]], <2 
x i1> [[TMP4]], 1 +; CHECK-NEXT: ret { <2 x double>, <2 x i1> } [[RES_FIRST_LANE_ONLY]] +; + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <2 x i1> ) + ret { <2 x double>, <2 x i1> } %res +} + +define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false(ptr %p) { +; CHECK-LABEL: define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: br i1 false, label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]] +; CHECK: [[LOAD_FF_FIRST_LANE]]: +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[P]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP1]], i64 0 +; CHECK-NEXT: br label %[[LOAD_FF_RESULT]] +; CHECK: [[LOAD_FF_RESULT]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <16 x i16> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <16 x i1> [ zeroinitializer, [[TMP0]] ], [ , %[[LOAD_FF_FIRST_LANE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <16 x i16>, <16 x i1> } poison, <16 x i16> [[TMP3]], 0 +; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <16 x i16>, <16 x i1> } [[TMP5]], <16 x i1> [[TMP4]], 1 +; CHECK-NEXT: ret { <16 x i16>, <16 x i1> } [[RES_FIRST_LANE_ONLY]] +; + %res = call { <16 x i16>, <16 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 32, <16 x i1> zeroinitializer) + ret { <16 x i16>, <16 x i1> } %res +} From c1492bb615453ca57708d2347ce74b7b151a0924 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 10 Feb 2026 11:56:47 +0000 Subject: [PATCH 2/7] Reword langref entry based on comments --- llvm/docs/LangRef.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index f5311c7c447c9..edc784d8a023f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -27854,22 +27854,22 @@ The '``llvm.masked.load.first.fault``' intrinsic is similar to the memory into a vector based on a mask. 
However, it allows loading from addresses which may not be entirely safe. If the memory corresponding to the first element of the vector is inaccessible, then a fault will be raised as normal. For all -subsequent lanes faults will be suppressed and the corresponding bit in the -output mask will be marked inactive. The remaining elements in the output mask -after a suppressed fault will also be marked inactive. All elements in the data -result (first vector in the returned struct) with a corresponding element in the -mask result (second vector in the returned struct) set to inactive contain -poison values. +subsequent lanes, if a fault occurs, it will be suppressed and the corresponding +bit in the output mask will be marked inactive. The remaining elements in the +output mask after a suppressed fault will also be marked inactive. All elements +in the data result (first vector in the returned struct) with a corresponding +element in the mask result (second vector in the returned struct) set to +inactive contain poison values. Reasons for marking output elements inactive are processor dependent; it may be a genuine fault, e.g. if the range of the data being loaded spans a page -boundary and the page at the higher address is not mapped. But a given -processor may also mark elements as inactive for other reasons, such as a cache -miss. Code using this intrinsic must take this into account and not assume that -inactive lanes signal the end of accessible memory. If more data should be -loaded based on the semantics of the user code, then the base pointer should be -advanced to the address of the first inactive element and a new first fault load -attempted. +boundary and the page at the higher address is not mapped. It may also be due to +the hardware lacking a way of suppressing faults. But a given processor may also +mark elements as inactive for other reasons, such as a cache miss. 
Code using +this intrinsic must take this into account and not assume that inactive lanes +signal the end of accessible memory. If more data should be loaded based on the +semantics of the user code, then the base pointer should be advanced to the +address of the first inactive element and a new first fault load attempted. Memory Use Markers ------------------ From ca5d3a6a01eeac13ce2c7664110d19216c2170b6 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 10 Feb 2026 13:51:42 +0000 Subject: [PATCH 3/7] Try to fix windows build --- llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index c1f0a81183f8c..a91e63d979e7c 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -1071,7 +1071,7 @@ static void scalarizeMaskedFirstFaultingLoad(const DataLayout &DL, CallInst *CI, // %first.active = extractelement %mask, i64 0 // br i1 %first.active, label %load.ff.first.lane, label %load.ff.result Value *FirstActive = - Builder.CreateExtractElement(Mask, 0ul, Twine("first.active")); + Builder.CreateExtractElement(Mask, uint64_t(0ull), Twine("first.active")); Instruction *ThenTerm = SplitBlockAndInsertIfThen(FirstActive, CI, /*Unreachable=*/false, @@ -1088,9 +1088,11 @@ static void scalarizeMaskedFirstFaultingLoad(const DataLayout &DL, CallInst *CI, ThenBlock->setName("load.ff.first.lane"); Builder.SetInsertPoint(ThenBlock->getTerminator()); LoadInst *Load = Builder.CreateAlignedLoad(ScalarTy, Ptr, AlignVal); - Value *OneLaneData = Builder.CreateInsertElement(PoisonData, Load, 0ul); + Value *OneLaneData = + Builder.CreateInsertElement(PoisonData, Load, uint64_t(0ull)); Value *OneLaneMask = Builder.CreateInsertElement( - EmptyMask, Constant::getAllOnesValue(MaskTy->getElementType()), 0ul); + EmptyMask, 
Constant::getAllOnesValue(MaskTy->getElementType()), + uint64_t(0ull)); // Now we just select between the two based on the check of the first lane // From c517f9f3ed5f237d8a5dfe8a5b5b2a142c1ef91c Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 17 Feb 2026 15:22:51 +0000 Subject: [PATCH 4/7] Renamed intrinsic --- llvm/docs/LangRef.rst | 32 +++++++++---------- llvm/include/llvm/IR/Intrinsics.td | 2 +- .../Scalar/ScalarizeMaskedMemIntrin.cpp | 4 +-- .../AArch64/masked-load-first-faulting.ll | 6 ++-- .../expand-masked-load-first-fault.ll | 8 ++--- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index edc784d8a023f..e1154c3757264 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -27811,8 +27811,8 @@ Other targets may support this intrinsic differently, for example, by lowering i .. _int_mloadff: -'``llvm.masked.load.first.fault.*``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'``llvm.masked.load.ff.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" @@ -27821,10 +27821,10 @@ floating-point or pointer data type. :: - declare { <16 x float>, <16 x i1> } @llvm.masked.load.first.fault.v16f32.p0(ptr , i32 , <16 x i1> ) - declare { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault.v2f64.p0(ptr , i32 , <2 x i1> ) + declare { <16 x float>, <16 x i1> } @llvm.masked.load.ff.v16f32.p0(ptr , i32 , <16 x i1> ) + declare { <2 x double>, <2 x i1> } @llvm.masked.load.ff.v2f64.p0(ptr , i32 , <2 x i1> ) ;; The data is a vector of pointers - declare { <8 x ptr>, <8 x i1> } @llvm.masked.load.first.fault.v8p0.p0(ptr , i32 , <8 x i1> ) + declare { <8 x ptr>, <8 x i1> } @llvm.masked.load.ff.v8p0.p0(ptr , i32 , <8 x i1> ) Overview: """"""""" @@ -27849,17 +27849,17 @@ number of elements as the return type. 
Semantics: """""""""" -The '``llvm.masked.load.first.fault``' intrinsic is similar to the -'``llvm.masked.load``' intrinsic, in that it conditionally loads values from -memory into a vector based on a mask. However, it allows loading from addresses -which may not be entirely safe. If the memory corresponding to the first element -of the vector is inaccessible, then a fault will be raised as normal. For all -subsequent lanes, if a fault occurs, it will be suppressed and the corresponding -bit in the output mask will be marked inactive. The remaining elements in the -output mask after a suppressed fault will also be marked inactive. All elements -in the data result (first vector in the returned struct) with a corresponding -element in the mask result (second vector in the returned struct) set to -inactive contain poison values. +The '``llvm.masked.load.ff``' intrinsic is similar to the '``llvm.masked.load``' +intrinsic, in that it conditionally loads values from memory into a vector based +on a mask. However, it allows loading from addresses which may not be entirely +safe. If the memory corresponding to the first element of the vector is +inaccessible, then a fault will be raised as normal. For all subsequent lanes, +if a fault occurs, it will be suppressed and the corresponding bit in the output +mask will be marked inactive. The remaining elements in the output mask after a +suppressed fault will also be marked inactive. All elements in the data result +(first vector in the returned struct) with a corresponding element in the mask +result (second vector in the returned struct) set to inactive contain poison +values. Reasons for marking output elements inactive are processor dependent; it may be a genuine fault, e.g. 
if the range of the data being loaded spans a page diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 9fc7e0df9a1cd..000ed0bae37f3 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2598,7 +2598,7 @@ def int_masked_compressstore: [IntrWriteMem, IntrArgMemOnly, NoCapture>]>; -def int_masked_load_first_fault: +def int_masked_load_ff: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyptr_ty, llvm_i32_ty, diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index a91e63d979e7c..2d093f75591c7 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -1197,7 +1197,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, any_of(II->args(), [](Value *V) { return isa(V->getType()); })) { - if (II->getIntrinsicID() == Intrinsic::masked_load_first_fault) { + if (II->getIntrinsicID() == Intrinsic::masked_load_ff) { scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT); return true; } @@ -1273,7 +1273,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, scalarizeMaskedCompressStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; - case Intrinsic::masked_load_first_fault: { + case Intrinsic::masked_load_ff: { scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT); return true; } diff --git a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll index 3d2f8c705ba40..4f0e6e152ec80 100644 --- a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll +++ b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll @@ -53,7 +53,7 @@ define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { ; SME_STREAMING-NEXT: adrp x8, .LCPI0_0 ; SME_STREAMING-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] ; SME_STREAMING-NEXT: 
ret - %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <4 x i1> %mask) + %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <4 x i1> %mask) ret { <4 x i32>, <4 x i1> } %res } @@ -78,7 +78,7 @@ define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_fully_aligned(ptr %p) ; SME_STREAMING-NEXT: index z1.s, #1, #-1 ; SME_STREAMING-NEXT: ld1rd { z0.d }, p0/z, [x0] ; SME_STREAMING-NEXT: ret - %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <2 x i1> ) + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <2 x i1> ) ret { <2 x double>, <2 x i1> } %res } @@ -103,6 +103,6 @@ define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_partially_aligned(ptr ; SME_STREAMING-NEXT: index z1.s, #1, #-1 ; SME_STREAMING-NEXT: ld1rd { z0.d }, p0/z, [x0] ; SME_STREAMING-NEXT: ret - %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 8, <2 x i1> ) + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr %p, i32 8, <2 x i1> ) ret { <2 x double>, <2 x i1> } %res } diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll index ed78888e40691..85b6462427a5f 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll @@ -17,7 +17,7 @@ define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { ; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[TMP5]], <4 x i1> [[TMP4]], 1 ; CHECK-NEXT: ret { <4 x i32>, <4 x i1> } [[RES_FIRST_LANE_ONLY]] ; - %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <4 x i1> %mask) + %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <4 x i1> %mask) ret { <4 x i32>, <4 x i1> } %res } @@ 
-39,7 +39,7 @@ define { , } @load_ff_nxv4i32(ptr %p, , } [[TMP5]], [[TMP4]], 1 ; CHECK-NEXT: ret { , } [[RES]] ; - %res = call { , } @llvm.masked.load.first.fault(ptr %p, i32 16, %mask) + %res = call { , } @llvm.masked.load.ff(ptr %p, i32 16, %mask) ret { , } %res } @@ -58,7 +58,7 @@ define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true(ptr %p) { ; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <2 x double>, <2 x i1> } [[TMP5]], <2 x i1> [[TMP4]], 1 ; CHECK-NEXT: ret { <2 x double>, <2 x i1> } [[RES_FIRST_LANE_ONLY]] ; - %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 16, <2 x i1> ) + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <2 x i1> ) ret { <2 x double>, <2 x i1> } %res } @@ -77,6 +77,6 @@ define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false(ptr %p) { ; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <16 x i16>, <16 x i1> } [[TMP5]], <16 x i1> [[TMP4]], 1 ; CHECK-NEXT: ret { <16 x i16>, <16 x i1> } [[RES_FIRST_LANE_ONLY]] ; - %res = call { <16 x i16>, <16 x i1> } @llvm.masked.load.first.fault(ptr %p, i32 32, <16 x i1> zeroinitializer) + %res = call { <16 x i16>, <16 x i1> } @llvm.masked.load.ff(ptr %p, i32 32, <16 x i1> zeroinitializer) ret { <16 x i16>, <16 x i1> } %res } From dcb8f374c16199f556720762e0175647d7f0af67 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 17 Feb 2026 16:04:48 +0000 Subject: [PATCH 5/7] Moved alignment to parameter attribute --- llvm/docs/LangRef.rst | 16 +++++++++------- llvm/include/llvm/IR/Intrinsics.td | 7 +++---- .../Scalar/ScalarizeMaskedMemIntrin.cpp | 6 ++---- .../AArch64/masked-load-first-faulting.ll | 6 +++--- .../expand-masked-load-first-fault.ll | 8 ++++---- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e1154c3757264..3c0e5e87dc246 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -27821,10 +27821,10 @@ floating-point or pointer data type. 
:: - declare { <16 x float>, <16 x i1> } @llvm.masked.load.ff.v16f32.p0(ptr <ptr>, i32 <alignment>, <16 x i1> <mask>) - declare { <2 x double>, <2 x i1> } @llvm.masked.load.ff.v2f64.p0(ptr <ptr>, i32 <alignment>, <2 x i1> <mask>) + declare { <16 x float>, <16 x i1> } @llvm.masked.load.ff.v16f32.p0(ptr <ptr>, <16 x i1> <mask>) + declare { <2 x double>, <2 x i1> } @llvm.masked.load.ff.v2f64.p0(ptr <ptr>, <2 x i1> <mask>) ;; The data is a vector of pointers - declare { <8 x ptr>, <8 x i1> } @llvm.masked.load.ff.v8p0.p0(ptr <ptr>, i32 <alignment>, <8 x i1> <mask>) + declare { <8 x ptr>, <8 x i1> } @llvm.masked.load.ff.v8p0.p0(ptr align 8 <ptr>, <8 x i1> <mask>) Overview: """"""""" @@ -27841,10 +27841,12 @@ values. Arguments: """""""""" -The first argument is the base pointer for the load. The second argument is the -alignment of the source location. It must be a power of two constant integer -value. The third argument, mask, is a vector of boolean values with the same -number of elements as the return type. +The first argument is the base pointer for the load. The second argument, mask, +is a vector of boolean values with the same number of elements as the return +type. + +The :ref:`align <attr_align>` parameter attribute can be provided for the first +argument. 
Semantics: """""""""" diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 000ed0bae37f3..1d9267487a886 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2601,10 +2601,9 @@ def int_masked_compressstore: def int_masked_load_ff: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [llvm_anyptr_ty, llvm_i32_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>, - NoCapture<ArgIndex<0>>]>; + [llvm_anyptr_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>]>; def int_experimental_vector_compress: DefaultAttrsIntrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 2d093f75591c7..276685ea041a6 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -1050,15 +1050,13 @@ static void scalarizeMaskedFirstFaultingLoad(const DataLayout &DL, CallInst *CI, // input mask bit is active), then mark all other lanes as inactive in the // output mask and embed the first lane into a vector of poison. 
Value *Ptr = CI->getArgOperand(0); - Value *Align = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); + MaybeAlign AlignVal = CI->getParamAlign(0); + Value *Mask = CI->getArgOperand(1); StructType *RetTy = cast<StructType>(CI->getType()); VectorType *DataTy = cast<VectorType>(RetTy->getElementType(0)); VectorType *MaskTy = cast<VectorType>(RetTy->getElementType(1)); Type *ScalarTy = DataTy->getScalarType(); - MaybeAlign AlignVal = cast<ConstantInt>(Align)->getMaybeAlignValue(); - IRBuilder<> Builder(CI->getContext()); BasicBlock *IfBlock = CI->getParent(); Builder.SetInsertPoint(CI); diff --git a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll index 4f0e6e152ec80..b00057a1e4724 100644 --- a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll +++ b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll @@ -53,7 +53,7 @@ define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { ; SME_STREAMING-NEXT: adrp x8, .LCPI0_0 ; SME_STREAMING-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] ; SME_STREAMING-NEXT: ret - %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <4 x i1> %mask) + %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <4 x i1> %mask) ret { <4 x i32>, <4 x i1> } %res } @@ -78,7 +78,7 @@ define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_fully_aligned(ptr %p) ; SME_STREAMING-NEXT: index z1.s, #1, #-1 ; SME_STREAMING-NEXT: ld1rd { z0.d }, p0/z, [x0] ; SME_STREAMING-NEXT: ret - %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <2 x i1> ) + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <2 x i1> ) ret { <2 x double>, <2 x i1> } %res } @@ -103,6 +103,6 @@ define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_partially_aligned(ptr ; SME_STREAMING-NEXT: index z1.s, #1, #-1 ; SME_STREAMING-NEXT: ld1rd { z0.d }, p0/z, [x0] ; SME_STREAMING-NEXT: ret - %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr %p, i32 8, <2 
x i1> ) + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 8 %p, <2 x i1> ) ret { <2 x double>, <2 x i1> } %res } diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll index 85b6462427a5f..3c74ab15f8e23 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll @@ -17,7 +17,7 @@ define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { ; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[TMP5]], <4 x i1> [[TMP4]], 1 ; CHECK-NEXT: ret { <4 x i32>, <4 x i1> } [[RES_FIRST_LANE_ONLY]] ; - %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <4 x i1> %mask) + %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <4 x i1> %mask) ret { <4 x i32>, <4 x i1> } %res } @@ -39,7 +39,7 @@ define { , } @load_ff_nxv4i32(ptr %p, , } [[TMP5]], [[TMP4]], 1 ; CHECK-NEXT: ret { , } [[RES]] ; - %res = call { , } @llvm.masked.load.ff(ptr %p, i32 16, %mask) + %res = call { , } @llvm.masked.load.ff(ptr align 16 %p, %mask) ret { , } %res } @@ -58,7 +58,7 @@ define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true(ptr %p) { ; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <2 x double>, <2 x i1> } [[TMP5]], <2 x i1> [[TMP4]], 1 ; CHECK-NEXT: ret { <2 x double>, <2 x i1> } [[RES_FIRST_LANE_ONLY]] ; - %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr %p, i32 16, <2 x i1> ) + %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <2 x i1> ) ret { <2 x double>, <2 x i1> } %res } @@ -77,6 +77,6 @@ define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false(ptr %p) { ; CHECK-NEXT: [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <16 x i16>, <16 x i1> } [[TMP5]], <16 x i1> [[TMP4]], 1 ; CHECK-NEXT: ret { <16 x i16>, <16 x 
i1> } [[RES_FIRST_LANE_ONLY]] ; - %res = call { <16 x i16>, <16 x i1> } @llvm.masked.load.ff(ptr %p, i32 32, <16 x i1> zeroinitializer) + %res = call { <16 x i16>, <16 x i1> } @llvm.masked.load.ff(ptr align 32 %p, <16 x i1> zeroinitializer) ret { <16 x i16>, <16 x i1> } %res } From 15491cfbad88fa9284a97fb995866c7382b4a8cd Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 17 Feb 2026 16:16:12 +0000 Subject: [PATCH 6/7] Replace semantics section with link to vp.load.ff + differences --- llvm/docs/LangRef.rst | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3c0e5e87dc246..79ea01d959312 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -27851,27 +27851,13 @@ argument. Semantics: """""""""" -The '``llvm.masked.load.ff``' intrinsic is similar to the '``llvm.masked.load``' -intrinsic, in that it conditionally loads values from memory into a vector based -on a mask. However, it allows loading from addresses which may not be entirely -safe. If the memory corresponding to the first element of the vector is -inaccessible, then a fault will be raised as normal. For all subsequent lanes, -if a fault occurs, it will be suppressed and the corresponding bit in the output -mask will be marked inactive. The remaining elements in the output mask after a -suppressed fault will also be marked inactive. All elements in the data result -(first vector in the returned struct) with a corresponding element in the mask -result (second vector in the returned struct) set to inactive contain poison -values. +The '``llvm.masked.load.ff``' intrinsic is very similar to the +'``llvm.vp.load.ff``' intrinsic, with the differences being the lack of an EVL +parameter and the second returned value being a mask instead of an updated EVL +value. -Reasons for marking output elements inactive are processor dependent; it may be -a genuine fault, e.g. 
if the range of the data being loaded spans a page -boundary and the page at the higher address is not mapped. It may also be due to -the hardware lacking a way of suppressing faults. But a given processor may also -mark elements as inactive for other reasons, such as a cache miss. Code using -this intrinsic must take this into account and not assume that inactive lanes -signal the end of accessible memory. If more data should be loaded based on the -semantics of the user code, then the base pointer should be advanced to the -address of the first inactive element and a new first fault load attempted. +If the processor suppresses a fault for any lane, then the returned mask will +indicate that lane and all subsequent lanes are inactive. Memory Use Markers ------------------ From bed61d93cc3b63a84d3988cde28fe55c29038231 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 17 Mar 2026 11:53:44 +0000 Subject: [PATCH 7/7] Update tests after rebase --- .../AArch64/masked-load-first-faulting.ll | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll index b00057a1e4724..0b8f831480f94 100644 --- a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll +++ b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll @@ -10,9 +10,9 @@ define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { ; NEON-NEXT: umov w8, v0.h[0] ; NEON-NEXT: tbz w8, #0, .LBB0_2 ; NEON-NEXT: // %bb.1: // %load.ff.first.lane -; NEON-NEXT: adrp x8, .LCPI0_0 +; NEON-NEXT: mov w8, #1 // =0x1 ; NEON-NEXT: ldr s0, [x0] -; NEON-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; NEON-NEXT: fmov d1, x8 ; NEON-NEXT: // kill: def $d1 killed $d1 killed $q1 ; NEON-NEXT: ret ; NEON-NEXT: .LBB0_2: @@ -27,9 +27,9 @@ define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { ; SVE-NEXT: umov w8, v0.h[0] ; SVE-NEXT: tbz w8, #0, .LBB0_2 ; SVE-NEXT: // %bb.1: // 
%load.ff.first.lane -; SVE-NEXT: adrp x8, .LCPI0_0 +; SVE-NEXT: mov w8, #1 // =0x1 ; SVE-NEXT: ldr s0, [x0] -; SVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; SVE-NEXT: fmov d1, x8 ; SVE-NEXT: // kill: def $d1 killed $d1 killed $q1 ; SVE-NEXT: ret ; SVE-NEXT: .LBB0_2: @@ -60,9 +60,9 @@ define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) { define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_fully_aligned(ptr %p) { ; NEON-LABEL: load_ff_v2f64_all_true_fully_aligned: ; NEON: // %bb.0: // %load.ff.first.lane -; NEON-NEXT: adrp x8, .LCPI1_0 +; NEON-NEXT: mov w8, #1 // =0x1 ; NEON-NEXT: ldr d0, [x0] -; NEON-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; NEON-NEXT: fmov d1, x8 ; NEON-NEXT: ret ; ; SVE-LABEL: load_ff_v2f64_all_true_fully_aligned: @@ -85,9 +85,9 @@ define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_fully_aligned(ptr %p) define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_partially_aligned(ptr %p) { ; NEON-LABEL: load_ff_v2f64_all_true_partially_aligned: ; NEON: // %bb.0: // %load.ff.first.lane -; NEON-NEXT: adrp x8, .LCPI2_0 +; NEON-NEXT: mov w8, #1 // =0x1 ; NEON-NEXT: ldr d0, [x0] -; NEON-NEXT: ldr d1, [x8, :lo12:.LCPI2_0] +; NEON-NEXT: fmov d1, x8 ; NEON-NEXT: ret ; ; SVE-LABEL: load_ff_v2f64_all_true_partially_aligned: