diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 45a22f416dce1..79ea01d959312 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -27809,6 +27809,55 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
 Other targets may support this intrinsic differently, for example, by lowering
 it into a sequence of branches that guard scalar store operations.
 
+.. _int_mloadff:
+
+'``llvm.masked.load.ff.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The loaded data is a vector of any integer,
+floating-point or pointer data type.
+
+::
+
+      declare { <16 x float>, <16 x i1> } @llvm.masked.load.ff.v16f32.p0(ptr <ptr>, <16 x i1> <mask>)
+      declare { <2 x double>, <2 x i1> } @llvm.masked.load.ff.v2f64.p0(ptr <ptr>, <2 x i1> <mask>)
+      ;; The data is a vector of pointers
+      declare { <8 x ptr>, <8 x i1> } @llvm.masked.load.ff.v8p0.p0(ptr align 8 <ptr>, <8 x i1> <mask>)
+
+Overview:
+"""""""""
+
+Reads a vector from memory according to the provided mask, suppressing faults
+for any lane beyond the first. The mask holds a bit for each vector lane, and
+is used to prevent memory accesses to the masked-off lanes.
+
+Returns the loaded data and a mask indicating which lanes are valid. The
+returned mask may differ from the input mask if the processor encountered a
+reason, such as a potential fault, to avoid loading a lane beyond the first.
+Invalid lanes of the returned data contain poison values.
+
+Arguments:
+""""""""""
+
+The first argument is the base pointer for the load. The second argument,
+mask, is a vector of boolean values with the same number of elements as the
+return type.
+
+The :ref:`align <attr_align>` parameter attribute can be provided for the
+first argument.
+
+Semantics:
+""""""""""
+
+The '``llvm.masked.load.ff``' intrinsic is very similar to the
+'``llvm.vp.load.ff``' intrinsic, with the differences being the lack of an EVL
+parameter and the second returned value being a mask instead of an updated EVL
+value.
+
+If the processor suppresses a fault for any lane, the returned mask will mark
+that lane and all subsequent lanes as inactive.
+
 Memory Use Markers
 ------------------
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 4469ff155b854..1d9267487a886 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2598,6 +2598,13 @@ def int_masked_compressstore:
               [IntrWriteMem, IntrArgMemOnly, NoCapture<ArgIndex<1>>]>;
 
+def int_masked_load_ff:
+    DefaultAttrsIntrinsic<[llvm_anyvector_ty,
+                           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                          [llvm_anyptr_ty,
+                           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                          [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
+
 def int_experimental_vector_compress:
     DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                           [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                            LLVMMatchType<0>],
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index da9ceb4f440e5..276685ea041a6 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -1040,6 +1040,85 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
   ModifiedDT = true;
 }
 
+static void scalarizeMaskedFirstFaultingLoad(const DataLayout &DL, CallInst *CI,
+                                             DomTreeUpdater *DTU,
+                                             bool &ModifiedDT) {
+  // For a target without first-faulting load support, we can't actually
+  // scalarize accesses for all lanes.
+  // However, lanes beyond the first may be considered inactive for reasons
+  // other than a fault, so for generic 'scalarization' we can just load the
+  // first lane (if the corresponding input mask bit is active), then mark all
+  // other lanes as inactive in the output mask and embed the first lane into
+  // a vector of poison.
+  Value *Ptr = CI->getArgOperand(0);
+  MaybeAlign AlignVal = CI->getParamAlign(0);
+  Value *Mask = CI->getArgOperand(1);
+  StructType *RetTy = cast<StructType>(CI->getType());
+  VectorType *DataTy = cast<VectorType>(RetTy->getElementType(0));
+  VectorType *MaskTy = cast<VectorType>(RetTy->getElementType(1));
+  Type *ScalarTy = DataTy->getScalarType();
+
+  IRBuilder<> Builder(CI->getContext());
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(CI);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  Value *EmptyMask = Constant::getNullValue(MaskTy);
+  Value *PoisonData = PoisonValue::get(DataTy);
+
+  // First create a check to determine whether the first lane is active
+  //
+  //  %first.active = extractelement <N x i1> %mask, i64 0
+  //  br i1 %first.active, label %load.ff.first.lane, label %load.ff.result
+  Value *FirstActive =
+      Builder.CreateExtractElement(Mask, uint64_t(0ull), Twine("first.active"));
+  Instruction *ThenTerm =
+      SplitBlockAndInsertIfThen(FirstActive, CI,
+                                /*Unreachable=*/false,
+                                /*BranchWeights=*/nullptr, DTU);
+
+  // If the first mask lane was active, then we want a real load of one element
+  // into the first element of a vector, with the rest being poison.
+  //
+  // load.ff.first.lane:
+  //  %ld.first = load ty, ptr %Ptr
+  //  %lane = insertelement <N x ty> poison, ty %ld.first, i64 0
+  //  br label %load.ff.result
+  BasicBlock *ThenBlock = ThenTerm->getParent();
+  ThenBlock->setName("load.ff.first.lane");
+  Builder.SetInsertPoint(ThenBlock->getTerminator());
+  LoadInst *Load = Builder.CreateAlignedLoad(ScalarTy, Ptr, AlignVal);
+  Value *OneLaneData =
+      Builder.CreateInsertElement(PoisonData, Load, uint64_t(0ull));
+  Value *OneLaneMask = Builder.CreateInsertElement(
+      EmptyMask, Constant::getAllOnesValue(MaskTy->getElementType()),
+      uint64_t(0ull));
+
+  // Now we just select between the two based on the check of the first lane
+  //
+  // load.ff.result:
+  //  %data.res = phi <N x ty> [ poison, %orig ], [ %lane, %load.ff.first.lane ]
+  //  %mask.res = phi <N x i1> [ false, %orig ], [ <i1 true, i1 false, ...>, %load.ff.first.lane ]
+  //  %ins = insertvalue { <N x ty>, <N x i1> } poison, %data.res, 0
+  //  %first.lane.only = insertvalue { <N x ty>, <N x i1> } %ins, ..., 1
+  //  ... replace all intrinsic uses with %first.lane.only
+  Builder.SetInsertPoint(CI);
+  PHINode *ResData = Builder.CreatePHI(DataTy, 2);
+  ResData->addIncoming(PoisonData, IfBlock);
+  ResData->addIncoming(OneLaneData, ThenBlock);
+  PHINode *ResMask = Builder.CreatePHI(MaskTy, 2);
+  ResMask->addIncoming(EmptyMask, IfBlock);
+  ResMask->addIncoming(OneLaneMask, ThenBlock);
+
+  Value *Result = PoisonValue::get(RetTy);
+  Result = Builder.CreateInsertValue(Result, ResData, 0ul);
+  Result = Builder.CreateInsertValue(Result, ResMask, 1ul);
+  if (CI->hasName())
+    Result->setName(CI->getName() + ".first.lane.only");
+  CI->getParent()->setName("load.ff.result");
+  CI->replaceAllUsesWith(Result);
+  CI->eraseFromParent();
+  ModifiedDT = true;
+}
+
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
                     DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
@@ -1110,11 +1189,18 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
                              DomTreeUpdater *DTU) {
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
   if (II) {
-    // The scalarization code below does not work for scalable vectors.
+    // The scalarization code below does not work for scalable vectors, except
+    // for first-faulting loads, which only need to deal with the first element.
     if (isa<ScalableVectorType>(II->getType()) ||
-        any_of(II->args(),
-               [](Value *V) { return isa<ScalableVectorType>(V->getType()); }))
+        any_of(II->args(), [](Value *V) {
+          return isa<ScalableVectorType>(V->getType());
+        })) {
+      if (II->getIntrinsicID() == Intrinsic::masked_load_ff) {
+        scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT);
+        return true;
+      }
       return false;
+    }
     switch (II->getIntrinsicID()) {
     default:
       break;
@@ -1185,6 +1271,10 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
       scalarizeMaskedCompressStore(DL, HasBranchDivergence, CI, DTU,
                                    ModifiedDT);
       return true;
+    case Intrinsic::masked_load_ff: {
+      scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT);
+      return true;
+    }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll
new file mode 100644
index 0000000000000..0b8f831480f94
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefix=NEON
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=SVE
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefix=SME_STREAMING
+
+define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) {
+; NEON-LABEL: load_ff_v4i32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NEON-NEXT:    umov w8, v0.h[0]
+; NEON-NEXT:    tbz w8, #0, .LBB0_2
+; NEON-NEXT:  // %bb.1: // %load.ff.first.lane
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    ldr s0, [x0]
+; NEON-NEXT:    fmov d1, x8
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+; NEON-NEXT:  .LBB0_2:
+; NEON-NEXT:    movi v1.2d, #0000000000000000
+; NEON-NEXT:    // implicit-def: $q0
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: load_ff_v4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SVE-NEXT:    umov w8, v0.h[0]
+; SVE-NEXT:    tbz w8, #0, .LBB0_2
+; SVE-NEXT:  // %bb.1: // %load.ff.first.lane
+; SVE-NEXT:    mov w8, #1 // =0x1
+; SVE-NEXT:    ldr s0, [x0]
+; SVE-NEXT:    fmov d1, x8
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+; SVE-NEXT:  .LBB0_2:
+; SVE-NEXT:    movi v1.2d, #0000000000000000
+; SVE-NEXT:    // implicit-def: $q0
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+;
+; SME_STREAMING-LABEL: load_ff_v4i32:
+; SME_STREAMING:       // %bb.0:
+; SME_STREAMING-NEXT:    fmov w8, s0
+; SME_STREAMING-NEXT:    tbz w8, #0, .LBB0_2
+; SME_STREAMING-NEXT:  // %bb.1: // %load.ff.first.lane
+; SME_STREAMING-NEXT:    ptrue p0.s
+; SME_STREAMING-NEXT:    adrp x8, .LCPI0_1
+; SME_STREAMING-NEXT:    ldr d1, [x8, :lo12:.LCPI0_1]
+; SME_STREAMING-NEXT:    ld1rw { z0.s }, p0/z, [x0]
+; SME_STREAMING-NEXT:    ret
+; SME_STREAMING-NEXT:  .LBB0_2:
+; SME_STREAMING-NEXT:    mov z1.h, #0 // =0x0
+; SME_STREAMING-NEXT:    adrp x8, .LCPI0_0
+; SME_STREAMING-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
+; SME_STREAMING-NEXT:    ret
+  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <4 x i1> %mask)
+  ret { <4 x i32>, <4 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_fully_aligned(ptr %p) {
+; NEON-LABEL: load_ff_v2f64_all_true_fully_aligned:
+; NEON:       // %bb.0: // %load.ff.first.lane
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    fmov d1, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: load_ff_v2f64_all_true_fully_aligned:
+; SVE:       // %bb.0: // %load.ff.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+;
+; SME_STREAMING-LABEL: load_ff_v2f64_all_true_fully_aligned:
+; SME_STREAMING:       // %bb.0: // %load.ff.first.lane
+; SME_STREAMING-NEXT:    ptrue p0.d
+; SME_STREAMING-NEXT:    index z1.s, #1, #-1
+; SME_STREAMING-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; SME_STREAMING-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_partially_aligned(ptr %p) {
+; NEON-LABEL: load_ff_v2f64_all_true_partially_aligned:
+; NEON:       // %bb.0: // %load.ff.first.lane
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    fmov d1, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: load_ff_v2f64_all_true_partially_aligned:
+; SVE:       // %bb.0: // %load.ff.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+;
+; SME_STREAMING-LABEL: load_ff_v2f64_all_true_partially_aligned:
+; SME_STREAMING:       // %bb.0: // %load.ff.first.lane
+; SME_STREAMING-NEXT:    ptrue p0.d
+; SME_STREAMING-NEXT:    index z1.s, #1, #-1
+; SME_STREAMING-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; SME_STREAMING-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 8 %p, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll
new file mode 100644
index 0000000000000..3c74ab15f8e23
--- /dev/null
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p scalarize-masked-mem-intrin -S < %s | FileCheck %s
+
+define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) {
+; CHECK-LABEL: define { <4 x i32>, <4 x i1> } @load_ff_v4i32(
+; CHECK-SAME: ptr [[P:%.*]], <4 x i1> [[MASK:%.*]]) {
+; CHECK-NEXT:    [[FIRST_ACTIVE:%.*]] = extractelement <4 x i1> [[MASK]], i64 0
+; CHECK-NEXT:    br i1 [[FIRST_ACTIVE]], label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i32> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i1> [ zeroinitializer, [[TMP0]] ], [ <i1 true, i1 false, i1 false, i1 false>, %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[TMP3]], 0
+; CHECK-NEXT:    [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[TMP5]], <4 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <4 x i32>, <4 x i1> } [[RES_FIRST_LANE_ONLY]]
+;
+  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <4 x i1> %mask)
+  ret { <4 x i32>, <4 x i1> } %res
+}
+
+;; We can 'scalarize' first-faulting loads for scalable vectors, since we only
+;; need to insert a single element into the start of a poison vector.
+define { <vscale x 4 x i32>, <vscale x 4 x i1> } @load_ff_nxv4i32(ptr %p, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i1> } @load_ff_nxv4i32(
+; CHECK-SAME: ptr [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]]) {
+; CHECK-NEXT:    [[FIRST_ACTIVE:%.*]] = extractelement <vscale x 4 x i1> [[MASK]], i64 0
+; CHECK-NEXT:    br i1 [[FIRST_ACTIVE]], label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <vscale x 4 x i32> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[TMP0]] ], [ insertelement (<vscale x 4 x i1> zeroinitializer, i1 true, i64 0), %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } poison, <vscale x 4 x i32> [[TMP3]], 0
+; CHECK-NEXT:    [[RES:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[TMP5]], <vscale x 4 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i1> } [[RES]]
+;
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <vscale x 4 x i1> %mask)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true(ptr %p) {
+; CHECK-LABEL: define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    br i1 true, label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x double> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i1> [ zeroinitializer, [[TMP0]] ], [ <i1 true, i1 false>, %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <2 x double>, <2 x i1> } poison, <2 x double> [[TMP3]], 0
+; CHECK-NEXT:    [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <2 x double>, <2 x i1> } [[TMP5]], <2 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <2 x double>, <2 x i1> } [[RES_FIRST_LANE_ONLY]]
+;
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
+
+define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false(ptr %p) {
+; CHECK-LABEL: define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    br i1 false, label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[P]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <16 x i16> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <16 x i1> [ zeroinitializer, [[TMP0]] ], [ <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <16 x i16>, <16 x i1> } poison, <16 x i16> [[TMP3]], 0
+; CHECK-NEXT:    [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <16 x i16>, <16 x i1> } [[TMP5]], <16 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <16 x i16>, <16 x i1> } [[RES_FIRST_LANE_ONLY]]
+;
+  %res = call { <16 x i16>, <16 x i1> } @llvm.masked.load.ff(ptr align 32 %p, <16 x i1> zeroinitializer)
+  ret { <16 x i16>, <16 x i1> } %res
+}
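
As additional context, here is a sketch of how a consumer might use the new
intrinsic once targets lower it natively. This is not part of the patch: the
function name @first_nul_byte and the loop structure are hypothetical, and the
sketch assumes only the signature documented in the LangRef hunk above. It
scans a NUL-terminated string in 16-byte chunks without risking a trap when a
chunk straddles an unmapped page: only lanes flagged valid in the returned
mask are inspected, and the offset advances by the number of valid lanes
rather than the full vector width.

define i64 @first_nul_byte(ptr %s) {
entry:
  br label %loop

loop:
  %off = phi i64 [ 0, %entry ], [ %off.next, %loop ]
  %p = getelementptr inbounds i8, ptr %s, i64 %off
  ; Faults for lanes past the first are suppressed; such lanes are instead
  ; reported as invalid in the returned mask.
  %ld = call { <16 x i8>, <16 x i1> } @llvm.masked.load.ff(ptr %p, <16 x i1> splat (i1 true))
  %data = extractvalue { <16 x i8>, <16 x i1> } %ld, 0
  %valid = extractvalue { <16 x i8>, <16 x i1> } %ld, 1
  ; Invalid data lanes are poison, so gate the comparison with a select
  ; (which blocks poison) rather than an 'and' (which propagates it).
  %is.nul = icmp eq <16 x i8> %data, zeroinitializer
  %hit = select <16 x i1> %valid, <16 x i1> %is.nul, <16 x i1> zeroinitializer
  %any = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %hit)
  ; The valid lanes form a prefix, so the number of bytes actually loaded is
  ; the count of trailing ones in the valid mask.
  %valid.bits = bitcast <16 x i1> %valid to i16
  %not.valid = xor i16 %valid.bits, -1
  %nvalid = call i16 @llvm.cttz.i16(i16 %not.valid, i1 false)
  %adv = zext i16 %nvalid to i64
  %off.next = add i64 %off, %adv
  br i1 %any, label %found, label %loop

found:
  ; Index of the first NUL among the valid lanes.
  %hit.bits = bitcast <16 x i1> %hit to i16
  %tz = call i16 @llvm.cttz.i16(i16 %hit.bits, i1 true)
  %idx = zext i16 %tz to i64
  %len = add i64 %off, %idx
  ret i64 %len
}

The generic lowering added in this patch keeps such a loop correct on targets
without first-faulting loads, though it degrades to one element per iteration;
targets with native support, such as SVE (ldff1) or RISC-V Vector
(fault-only-first loads), can lower the intrinsic to a single instruction.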