13 changes: 10 additions & 3 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -651,6 +651,16 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")

TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
@@ -670,9 +680,6 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")

TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")

TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
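A note on the prototype strings above: Clang's builtin encoding reads "v" as void, "c" as char, "i" as int, "V2i"/"V4i" as 2- and 4-element int vectors, "Li" as long, "*1"/"*3" as a pointer into address space 1 (global) or 3 (LDS), and a leading "I" as an argument that must be an integer constant expression. As a sketch of what this means for callers (parameter names are illustrative; the two trailing immediates are assumed to be the offset and cache-policy operands of the matching intrinsics added below), the b32 async load is effectively:

void __builtin_amdgcn_global_load_async_to_lds_b32(
    global int *gaddr,  /* i*1: source pointer, address space 1 (global) */
    local  int *laddr,  /* i*3: destination pointer, address space 3 (LDS) */
    int offset,         /* Ii: integer constant expression */
    int cpol);          /* Ii: integer constant expression */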

@@ -2,6 +2,89 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250

typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));

// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_load_async_to_lds_b8( global char* gaddr, local char* laddr)
{
__builtin_amdgcn_global_load_async_to_lds_b8(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b32(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_load_async_to_lds_b32(global int* gaddr, local int* laddr)
{
__builtin_amdgcn_global_load_async_to_lds_b32(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b64(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr)
{
__builtin_amdgcn_global_load_async_to_lds_b64(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b128(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_load_async_to_lds_b128( global v4i* gaddr, local v4i* laddr)
{
__builtin_amdgcn_global_load_async_to_lds_b128(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b8(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_store_async_from_lds_b8(global char* gaddr, local char* laddr)
{
__builtin_amdgcn_global_store_async_from_lds_b8(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b32(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_store_async_from_lds_b32(global int* gaddr, local int* laddr)
{
__builtin_amdgcn_global_store_async_from_lds_b32(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b64(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_store_async_from_lds_b64(global v2i* gaddr, local v2i* laddr)
{
__builtin_amdgcn_global_store_async_from_lds_b64(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b128(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_global_store_async_from_lds_b128(global v4i* gaddr, local v4i* laddr)
{
__builtin_amdgcn_global_store_async_from_lds_b128(gaddr, laddr, 16, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_async_barrier_arrive_b64(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR:%.*]])
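Taken together, these builtins asynchronously stage data between global memory and LDS. A minimal OpenCL sketch of the intended use (illustrative only: the kernel and variable names are made up, the zero offset and cache policy are assumptions, and the synchronization required before the LDS contents can be read, for example via the asynchronous-barrier builtins also covered by this patch, is not shown):

kernel void stage_tile(global int4 *src, local int4 *dst)
{
  size_t i = get_local_id(0);
  /* Each work-item issues one asynchronous 128-bit global-to-LDS copy.
     The offset (0) and cache policy (0) must be compile-time constants. */
  __builtin_amdgcn_global_load_async_to_lds_b128(src + i, dst + i, 0, 0);
  /* Wait for completion (hardware-specific) before reading from dst. */
}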
44 changes: 44 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3643,6 +3643,50 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[IntrNoMem, IntrSpeculatable]
>;

class AMDGPUAsyncGlobalLoadToLDS : Intrinsic <
[],
[global_ptr_ty, // Base global pointer to load from
local_ptr_ty, // LDS base pointer to store to.
llvm_i32_ty, // offset
llvm_i32_ty], // gfx12+ cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, ReadOnly<ArgIndex<0>>, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;

class AMDGPUAsyncGlobalStoreFromLDS : Intrinsic <
[],
[global_ptr_ty, // Base global pointer to store to
local_ptr_ty, // LDS base pointer to load from
llvm_i32_ty, // offset
llvm_i32_ty], // gfx12+ cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;

def int_amdgcn_global_load_async_to_lds_b8 :
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b8">, AMDGPUAsyncGlobalLoadToLDS;
def int_amdgcn_global_load_async_to_lds_b32 :
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b32">, AMDGPUAsyncGlobalLoadToLDS;
def int_amdgcn_global_load_async_to_lds_b64 :
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b64">, AMDGPUAsyncGlobalLoadToLDS;
def int_amdgcn_global_load_async_to_lds_b128 :
ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b128">, AMDGPUAsyncGlobalLoadToLDS;

def int_amdgcn_global_store_async_from_lds_b8 :
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b8">, AMDGPUAsyncGlobalStoreFromLDS;
def int_amdgcn_global_store_async_from_lds_b32 :
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b32">, AMDGPUAsyncGlobalStoreFromLDS;
def int_amdgcn_global_store_async_from_lds_b64 :
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b64">, AMDGPUAsyncGlobalStoreFromLDS;
def int_amdgcn_global_store_async_from_lds_b128 :
ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b128">, AMDGPUAsyncGlobalStoreFromLDS;

// WMMA intrinsics.
class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
Intrinsic<
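The final i32 operand of both intrinsic classes is the gfx12+ cache policy, with th in bits [0-2] and scope in bits [3-4], as the comments above note. An illustrative helper for packing the two fields (the macro name is made up here and is not provided by any LLVM or OpenCL header; a macro is used so the argument stays an integer constant expression, as the ImmArg attribute requires):

/* Pack th (bits [0-2]) and scope (bits [3-4]) into the cache-policy operand. */
#define AMDGCN_ASYNC_CPOL(th, scope) (((th) & 0x7) | (((scope) & 0x3) << 3))

/* Example: __builtin_amdgcn_global_store_async_from_lds_b32(gaddr, laddr, 0,
   AMDGCN_ASYNC_CPOL(0, 0)); */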
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -143,6 +143,9 @@ def gi_global_saddr_cpol :
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
def gi_global_saddr_no_ioffset :
GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
GIComplexPatternEquiv<GlobalSAddrNoIOffset>;

def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2049,6 +2049,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
return true;
}

bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
SDValue &SAddr,
SDValue &VOffset,
SDValue &CPol) const {
bool ScaleOffset;
SDValue DummyOffset;
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
false))
return false;

// We are assuming CPol is always the last operand of the intrinsic.
auto PassedCPol =
N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
CPol = CurDAG->getTargetConstant(
(ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
return true;
}

static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -174,6 +174,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5788,6 +5788,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
MachineOperand &Root) const {
const MachineInstr &I = *Root.getParent();

// We are assuming CPol is always the last operand of the intrinsic.
auto PassedCPol =
I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
return selectGlobalSAddr(Root, PassedCPol, false);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -264,6 +264,8 @@
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5364,6 +5364,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b128:
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
53 changes: 53 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,6 +11,7 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;

def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
@@ -1361,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;

class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
(node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
(inst $dsaddr, $vaddr, $offset, $cpol)
>;

class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
(node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
(inst $dsaddr, $saddr, $voffset, $offset, $cpol)
>;

class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
(node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
(inst $vaddr, $dsaddr, $offset, $cpol)
>;

class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
(node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
(inst $saddr, $voffset, $dsaddr, $offset, $cpol)
>;

class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
@@ -1571,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $saddr, $offset, $cpol)
>;

multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
def : FlatLoadLDSSignedPat <inst, node> {
let AddedComplexity = 10;
}

def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
let AddedComplexity = 11;
}
}

multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
def : FlatStoreLDSSignedPat <inst, node> {
let AddedComplexity = 10;
}

def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
let AddedComplexity = 11;
}
}

multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
@@ -2137,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in {
defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
} // End SubtargetPredicate = isGFX125xOnly

let OtherPredicates = [isGFX1250Plus] in {
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>;

defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>;
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>;
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>;
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
}

let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;