diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 945e11be31278..b8ece53328b3c 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -651,6 +651,16 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts") @@ -670,9 +680,6 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1 TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts") -TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts") - TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst") TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl index e3fe31ff7dd75..ccc05f0aa5af3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl @@ -2,6 +2,89 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_load_async_to_lds_b8( global char* gaddr, local char* laddr) +{ + __builtin_amdgcn_global_load_async_to_lds_b8(gaddr, laddr, 16, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_load_async_to_lds_b32(global int* gaddr, local int* laddr) +{ + __builtin_amdgcn_global_load_async_to_lds_b32(gaddr, laddr, 16, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr) +{ + __builtin_amdgcn_global_load_async_to_lds_b64(gaddr, laddr, 16, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_load_async_to_lds_b128( global v4i* gaddr, local v4i* laddr) +{ + __builtin_amdgcn_global_load_async_to_lds_b128(gaddr, laddr, 16, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_store_async_from_lds_b8(global char* gaddr, local char* laddr) +{ + __builtin_amdgcn_global_store_async_from_lds_b8(gaddr, laddr, 16, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_store_async_from_lds_b32(global int* gaddr, local int* laddr) +{ + __builtin_amdgcn_global_store_async_from_lds_b32(gaddr, laddr, 16, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_store_async_from_lds_b64(global v2i* gaddr, local v2i* laddr) +{ + __builtin_amdgcn_global_store_async_from_lds_b64(gaddr, laddr, 16, 0); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_global_store_async_from_lds_b128(global v4i* gaddr, local v4i* laddr) +{ + __builtin_amdgcn_global_store_async_from_lds_b128(gaddr, laddr, 16, 0); +} + // CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_async_barrier_arrive_b64( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR:%.*]]) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 3a7db6d599551..1da4e36c5a743 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3643,6 +3643,50 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< [IntrNoMem, IntrSpeculatable] >; +class AMDGPUAsyncGlobalLoadToLDS : Intrinsic < + [], + [global_ptr_ty, // Base global pointer to load from + local_ptr_ty, // LDS base pointer to store to. + llvm_i32_ty, // offset + llvm_i32_ty], // gfx12+ cachepolicy: + // bits [0-2] = th + // bits [3-4] = scope + [IntrInaccessibleMemOrArgMemOnly, ReadOnly>, WriteOnly>, NoCapture>, + NoCapture>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand] +>; + +class AMDGPUAsyncGlobalStoreFromLDS : Intrinsic < + [], + [global_ptr_ty, // Base global pointer to store to + local_ptr_ty, // LDS base pointer to load from + llvm_i32_ty, // offset + llvm_i32_ty], // gfx12+ cachepolicy: + // bits [0-2] = th + // bits [3-4] = scope + [IntrInaccessibleMemOrArgMemOnly, WriteOnly>, ReadOnly>, NoCapture>, + NoCapture>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand] +>; + +def int_amdgcn_global_load_async_to_lds_b8 : + ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b8">, AMDGPUAsyncGlobalLoadToLDS; +def int_amdgcn_global_load_async_to_lds_b32 : + ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b32">, AMDGPUAsyncGlobalLoadToLDS; +def int_amdgcn_global_load_async_to_lds_b64 : + ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b64">, AMDGPUAsyncGlobalLoadToLDS; +def int_amdgcn_global_load_async_to_lds_b128 : + ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b128">, AMDGPUAsyncGlobalLoadToLDS; + +def int_amdgcn_global_store_async_from_lds_b8 : + ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b8">, AMDGPUAsyncGlobalStoreFromLDS; +def int_amdgcn_global_store_async_from_lds_b32 : + ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b32">, AMDGPUAsyncGlobalStoreFromLDS; +def int_amdgcn_global_store_async_from_lds_b64 : + ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b64">, AMDGPUAsyncGlobalStoreFromLDS; +def int_amdgcn_global_store_async_from_lds_b128 : + ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b128">, AMDGPUAsyncGlobalStoreFromLDS; + // WMMA intrinsics. class AMDGPUWmmaIntrinsicModsAB : Intrinsic< diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index c01e5d3ff93c2..992572f17e5b9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -143,6 +143,9 @@ def gi_global_saddr_cpol : def gi_global_saddr_glc : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_global_saddr_no_ioffset : + GIComplexOperandMatcher, + GIComplexPatternEquiv; def gi_mubuf_scratch_offset : GIComplexOperandMatcher, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index dfaa1450e5c61..3d7e678d2e54f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2049,6 +2049,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 5636d896f2e7c..983f1aa8fab86 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -174,6 +174,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 266dee183229e..04773c9c7b773 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5788,6 +5788,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index fe9743d0a4b99..140e753bf976a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -264,6 +264,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { selectGlobalSAddrCPol(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrNoIOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c5a1d9e005e15..306443d25a74f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5364,6 +5364,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 0f172e0ddee56..8ede9caead8bc 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -11,6 +11,7 @@ let WantsRoot = true in { def GlobalOffset : ComplexPattern; def ScratchOffset : ComplexPattern; + def GlobalSAddrNoIOffset : ComplexPattern; def GlobalSAddr : ComplexPattern; def GlobalSAddrGLC : ComplexPattern; def GlobalSAddrCPol : ComplexPattern; @@ -1361,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 ; +class FlatLoadLDSSignedPat : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + +class FlatStoreLDSSignedPat : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $vaddr, $dsaddr, $offset, $cpol) +>; + +class GlobalStoreLDSSaddrPat : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $saddr, $voffset, $dsaddr, $offset, $cpol) +>; + class GlobalLoadSaddrPat_D16_t16 : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) @@ -1571,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 ; +multiclass GlobalLoadLDSPats { + def : FlatLoadLDSSignedPat { + let AddedComplexity = 10; + } + + def : GlobalLoadLDSSaddrPat(!cast(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + +multiclass GlobalStoreLDSPats { + def : FlatStoreLDSSignedPat { + let AddedComplexity = 10; + } + + def : GlobalStoreLDSSaddrPat(!cast(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats { def : FlatLoadSignedPat { let AddedComplexity = 10; @@ -2137,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in { defm : GlobalFLATLoadPats_CPOL ; } // End SubtargetPredicate = isGFX125xOnly +let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalLoadLDSPats ; + defm : GlobalLoadLDSPats ; + defm : GlobalLoadLDSPats ; + defm : GlobalLoadLDSPats ; + + defm : GlobalStoreLDSPats ; + defm : GlobalStoreLDSPats ; + defm : GlobalStoreLDSPats ; + defm : GlobalStoreLDSPats ; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9017f4f26f835..fbaf9bc452790 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1256,6 +1256,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } +static unsigned getIntrMemWidth(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + return 8; + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + return 32; + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + return 64; + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + return 128; + default: + llvm_unreachable("Unknown width"); + } +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1527,6 +1546,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(1); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID; @@ -1623,10 +1662,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll new file mode 100644 index 0000000000000..dd679101047ea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) +declare void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) +declare void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) +declare void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) + +define amdgpu_ps void @global_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_load_async_to_lds_b8_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_load_async_to_lds_b8_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_load_async_to_lds_b8_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_load_async_to_lds_b8 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_load_async_to_lds_b32_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_load_async_to_lds_b32_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 10) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_load_async_to_lds_b32_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 22) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_load_async_to_lds_b64_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_load_async_to_lds_b128_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_load_async_to_lds_b128_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 27) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_load_async_to_lds_b128_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_load_async_to_lds_b128 v0, v1, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { +; GFX1250-LABEL: global_load_async_to_lds_b32_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { +; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { +; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-SDAG-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll new file mode 100644 index 0000000000000..fd35313802558 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) +declare void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) +declare void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) +declare void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) + +define amdgpu_ps void @global_store_async_from_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_store_async_from_lds_b8_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_store_async_from_lds_b8 v[0:1], v2, off offset:16 th:TH_STORE_NT +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_store_async_from_lds_b8_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_store_async_from_lds_b8 v[0:1], v2, off offset:16 th:TH_STORE_NT +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_store_async_from_lds_b8_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_store_async_from_lds_b8 v1, v0, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_store_async_from_lds_b32: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_store_async_from_lds_b32 v[0:1], v2, off offset:16 th:TH_STORE_HT scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_store_async_from_lds_b32: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_store_async_from_lds_b32 v[0:1], v2, off offset:16 th:TH_STORE_HT scope:SCOPE_SE +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 10) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_store_async_from_lds_b32_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_store_async_from_lds_b64 v[0:1], v2, off offset:16 th:TH_STORE_NT_HT scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_store_async_from_lds_b64 v[0:1], v2, off offset:16 th:TH_STORE_NT_HT scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 22) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_store_async_from_lds_b64_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-SDAG-LABEL: global_store_async_from_lds_b128_vaddr: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] +; GFX1250-SDAG-NEXT: global_store_async_from_lds_b128 v[0:1], v2, off offset:16 th:TH_STORE_BYPASS scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_store_async_from_lds_b128_vaddr: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: global_store_async_from_lds_b128 v[0:1], v2, off offset:16 th:TH_STORE_BYPASS scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 27) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { +; GFX1250-LABEL: global_store_async_from_lds_b128_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 32 +; GFX1250-NEXT: global_store_async_from_lds_b128 v1, v0, s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 + call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { +; GFX1250-LABEL: global_store_async_from_lds_b32_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { +; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i64, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_store_async_from_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { +; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-SDAG-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom + call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) + ret void +}