[AMDGPU] Return two MMOs for load-to-lds and store-from-lds intrinsics#175845
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Nicolai Hähnle (nhaehnle) ChangesAccurately represent both the load and the store part of those The test changes seem to be mostly fairly insignificant changes caused by Stack:
Full diff: https://github.com/llvm/llvm-project/pull/175845.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cc239752cd453..5224f2f093755 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1440,8 +1440,23 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+
+ // Entry 0: Load from buffer.
+ // Don't set an offset, since the pointer value always represents the
+ // base of the buffer.
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
- Info.ptrVal = CI.getArgOperand(1);
+ Info.flags &= ~MachineMemOperand::MOStore;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to LDS.
+ // Instruction offset is applied, and an additional per-lane offset
+ // which we simulate using a larger memory type.
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+ Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))->getZExtValue();
+ Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
Infos.push_back(Info);
return;
}
@@ -1634,10 +1649,18 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+ // Entry 0: Load from source (global/flat).
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
- Info.ptrVal = CI.getArgOperand(1);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(0); // Global pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to LDS (same offset).
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(1); // LDS pointer
Infos.push_back(Info);
return;
}
@@ -1645,24 +1668,45 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ // Entry 0: Load from LDS.
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
- Info.ptrVal = CI.getArgOperand(0);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(1); // LDS pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ Infos.push_back(Info);
+
+ // Entry 1: Store to global (same offset).
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Info.ptrVal = CI.getArgOperand(0); // Global pointer
Infos.push_back(Info);
return;
}
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
- Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
- Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
- Info.ptrVal = CI.getArgOperand(1);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
- if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+ bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
+
+ // Entry 0: Load from source (global/flat).
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ Info.ptrVal = CI.getArgOperand(0); // Source pointer
+ Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
+ Info.flags |= MachineMemOperand::MOLoad;
+ if (IsVolatile)
Info.flags |= MachineMemOperand::MOVolatile;
Infos.push_back(Info);
+
+ // Entry 1: Store to LDS.
+ // Same offset from the instruction, but an additional per-lane offset is
+ // added. Represent that using a wider memory type.
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64);
+ Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
+ Info.flags &= ~MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOStore;
+ Infos.push_back(Info);
return;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11184,7 +11228,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID = Op.getConstantOperandVal(1);
- MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
@@ -11459,29 +11502,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
- MachineMemOperand *LoadMMO = M->getMemOperand();
- // Don't set the offset value here because the pointer points to the base of
- // the buffer.
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-
- MachinePointerInfo StorePtrI = LoadPtrI;
- LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
- LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
- StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
-
- auto F = LoadMMO->getFlags() &
- ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO =
- MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
- MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
- StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
-
auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
- DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+ DAG.setNodeMemRefs(Load, M->memoperands());
return SDValue(Load, 0);
}
@@ -11563,25 +11585,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
- MachineMemOperand *LoadMMO = M->getMemOperand();
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
- LoadPtrI.Offset = Op->getConstantOperandVal(5);
- MachinePointerInfo StorePtrI = LoadPtrI;
- LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
- LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
- StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
- auto F = LoadMMO->getFlags() &
- ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO =
- MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
- LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
- MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
- StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
- LoadMMO->getAAInfo());
-
auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
- DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+ DAG.setNodeMemRefs(Load, M->memoperands());
return SDValue(Load, 0);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 352af044b0a6d..f66ad928d261d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -267,9 +267,8 @@ main_body:
define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_volatile:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
@@ -278,9 +277,8 @@ define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture
;
; GFX942-LABEL: buffer_load_lds_dword_volatile:
; GFX942: ; %bb.0: ; %main_body
-; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
index 0b40c81af414d..14d2e4ca5f2c3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -340,18 +340,16 @@ define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gpt
define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
; GFX90A: ; %bb.0:
-; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
; GFX942: ; %bb.0:
-; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX942-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
index a00aca34252b1..74fddfe290818 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
@@ -11,15 +11,14 @@ define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr add
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT: s_add_u32 s4, s0, 64
; CHECK-NEXT: s_addc_u32 s5, s1, 0
; CHECK-NEXT: s_mov_b32 m0, s2
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_lds_dword v0, s[4:5] offset:4
-; CHECK-NEXT: s_load_dword s4, s[0:1], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s6
; CHECK-NEXT: global_store_dword v0, v3, s[0:1] offset:64
+; CHECK-NEXT: global_load_lds_dword v0, s[4:5] offset:4
; CHECK-NEXT: ; sched_barrier mask(0x00000000)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: v_mov_b32_e32 v2, s3
|
48931ab to
963ecef
Compare
1d72df0 to
b585dcf
Compare
krzysz00
left a comment
There was a problem hiding this comment.
Thanks for tackling this - one note
| // Entry 1: Store to LDS. | ||
| // Instruction offset is applied, and an additional per-lane offset | ||
| // which we simulate using a larger memory type. | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8 * 64); |
There was a problem hiding this comment.
This exists on gfx10, where the width is 32?
There was a problem hiding this comment.
Ugh, you're right. To be precise, it may be either 32 or 64.
| LoadPtrI.V = PoisonValue::get( | ||
| PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); | ||
| LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; | ||
| StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; |
There was a problem hiding this comment.
Huzzah! It's finally gone!
| ; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 | ||
| ; GFX942-NEXT: s_mov_b32 m0, s5 | ||
| ; GFX942-NEXT: s_nop 0 | ||
| ; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 |
There was a problem hiding this comment.
That's a nice side-effect of this
| Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; | ||
| Info.ptrVal = CI.getArgOperand(0); // Global pointer | ||
| Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue(); | ||
| Info.flags |= MachineMemOperand::MOLoad; |
There was a problem hiding this comment.
Info.flags &= ~MachineMemOperand::MOStore;
or is it unnecessary?
There was a problem hiding this comment.
The initial Info.flags doesn't set either Load or Store.
| // base of the buffer. | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); | ||
| Info.ptrVal = CI.getArgOperand(1); | ||
| Info.flags &= ~MachineMemOperand::MOStore; |
There was a problem hiding this comment.
Info.flags |= MachineMemOperand::MOLoad; ?
There was a problem hiding this comment.
The initial Info.flags sets both Load and Store.
There was a problem hiding this comment.
Why not just assign store then? Assume both is a conservative default, but I don't expect targets to be trying to read information from the default
There was a problem hiding this comment.
There are other flags in play here, including ones based on metadata that are setup in the beginning.
I was trying to keep the change minimal, but I agree that the code is confusing as-is. That said, looking into cleaning it up has uncovered a whole bunch of weirdness and inconsistencies, so I'm going to follow up with a separate change to refactor this.
963ecef to
1c445d8
Compare
b585dcf to
4fbc3c6
Compare
| Info.memVT = | ||
| EVT::getIntegerVT(CI.getContext(), Width * 8 * ST.getWavefrontSize()); | ||
| Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer | ||
| Info.flags &= ~MachineMemOperand::MOLoad; |
There was a problem hiding this comment.
Same as above, just assign the correct values instead of trying to read whatever happened to be the default
There was a problem hiding this comment.
Same note as above.
|
Re assigning vs notification - I think the interesting case is of the cache policy calls for volatile - we don't want to remove that and do want to make sure to add it ... Heck, can we get a test for a volatile global load to LDS (cache policy 1 << 31) |
4fbc3c6 to
8a444c4
Compare
1c445d8 to
940398a
Compare
Accurately represent both the load and the store part of those intrinsics. The test changes seem to be mostly fairly insignificant changes caused by subtly different scheduler behavior. commit-id:0269189c
8a444c4 to
8050ace
Compare
What do you have in mind? This stuff is notoriously difficult to test. I tried to make this change as uninvasive as possible, but I agree that the current flow of the flags is unfortunate. I'll follow up with a change to clean this up. It's a bit non-trivial since looking over it revealed some weirdness and inconsistencies that we'd better fix at the same time. |
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
llvm#175845) Accurately represent both the load and the store part of those intrinsics. The test changes seem to be mostly fairly insignificant changes caused by subtly different scheduler behavior.
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
…ntrinsics (llvm#175845)" This reverts commit 3e1e86e.
Accurately represent both the load and the store part of those
intrinsics.
The test changes seem to be mostly fairly insignificant changes caused by
subtly different scheduler behavior.
Stack: