Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 59 additions & 51 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1452,8 +1452,25 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

// Entry 0: Load from buffer.
// Don't set an offset, since the pointer value always represents the
// base of the buffer.
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
Info.ptrVal = CI.getArgOperand(1);
Info.flags &= ~MachineMemOperand::MOStore;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Info.flags |= MachineMemOperand::MOLoad; ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The initial Info.flags sets both Load and Store.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just assign store then? Assume both is a conservative default, but I don't expect targets to be trying to read information from the default

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are other flags in play here, including ones based on metadata that are setup in the beginning.

I was trying to keep the change minimal, but I agree that the code is confusing as-is. That said, looking into cleaning it up has uncovered a whole bunch of weirdness and inconsistencies, so I'm going to follow up with a separate change to refactor this.

Infos.push_back(Info);

// Entry 1: Store to LDS.
// Instruction offset is applied, and an additional per-lane offset
// which we simulate using a larger memory type.
Info.memVT = EVT::getIntegerVT(
CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
->getZExtValue();
Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
Info.flags &= ~MachineMemOperand::MOLoad;
Info.flags |= MachineMemOperand::MOStore;
Infos.push_back(Info);
return;
}
Expand Down Expand Up @@ -1646,35 +1663,65 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
// Entry 0: Load from source (global/flat).
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
Info.ptrVal = CI.getArgOperand(1);
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
Info.ptrVal = CI.getArgOperand(0); // Global pointer
Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
Info.flags |= MachineMemOperand::MOLoad;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Info.flags &= ~MachineMemOperand::MOStore;

or is it unnecessary?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The initial Info.flags doesn't set either Load or Store.

Infos.push_back(Info);

// Entry 1: Store to LDS (same offset).
Info.flags &= ~MachineMemOperand::MOLoad;
Info.flags |= MachineMemOperand::MOStore;
Info.ptrVal = CI.getArgOperand(1); // LDS pointer
Infos.push_back(Info);
return;
}
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
// Entry 0: Load from LDS.
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
Info.ptrVal = CI.getArgOperand(0);
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
Info.ptrVal = CI.getArgOperand(1); // LDS pointer
Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
Info.flags |= MachineMemOperand::MOLoad;
Infos.push_back(Info);

// Entry 1: Store to global (same offset).
Info.flags &= ~MachineMemOperand::MOLoad;
Info.flags |= MachineMemOperand::MOStore;
Info.ptrVal = CI.getArgOperand(0); // Global pointer
Infos.push_back(Info);
return;
}
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
Info.ptrVal = CI.getArgOperand(1);
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;

// Entry 0: Load from source (global/flat).
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
Info.ptrVal = CI.getArgOperand(0); // Source pointer
Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
Info.flags |= MachineMemOperand::MOLoad;
if (IsVolatile)
Info.flags |= MachineMemOperand::MOVolatile;
Infos.push_back(Info);

// Entry 1: Store to LDS.
// Same offset from the instruction, but an additional per-lane offset is
// added. Represent that using a wider memory type.
Info.memVT = EVT::getIntegerVT(CI.getContext(),
Width * 8 * Subtarget->getWavefrontSize());
Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
Info.flags &= ~MachineMemOperand::MOLoad;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above, just assign the correct values instead of trying to read whatever happened to be the default

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same note as above.

Info.flags |= MachineMemOperand::MOStore;
Infos.push_back(Info);
return;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
Expand Down Expand Up @@ -11345,7 +11392,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID = Op.getConstantOperandVal(1);
MachineFunction &MF = DAG.getMachineFunction();

switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
Expand Down Expand Up @@ -11620,29 +11666,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(1)); // Glue

auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
// Don't set the offset value here because the pointer points to the base of
// the buffer.
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();

MachinePointerInfo StorePtrI = LoadPtrI;
LoadPtrI.V = PoisonValue::get(
PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

auto F = LoadMMO->getFlags() &
~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
LoadMMO =
MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());

MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());

auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
DAG.setNodeMemRefs(Load, M->memoperands());

return SDValue(Load, 0);
}
Expand Down Expand Up @@ -11724,25 +11749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(M0Val.getValue(1)); // Glue

auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = Op->getConstantOperandVal(5);
MachinePointerInfo StorePtrI = LoadPtrI;
LoadPtrI.V = PoisonValue::get(
PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Huzzah! It's finally gone!

auto F = LoadMMO->getFlags() &
~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
LoadMMO =
MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
LoadMMO->getAAInfo());

auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
DAG.setNodeMemRefs(Load, M->memoperands());

return SDValue(Load, 0);
}
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
Original file line number Diff line number Diff line change
Expand Up @@ -267,9 +267,8 @@ main_body:
define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: buffer_load_lds_dword_volatile:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
Expand All @@ -278,9 +277,8 @@ define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture
;
; GFX942-LABEL: buffer_load_lds_dword_volatile:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a nice side-effect of this

; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,18 +340,16 @@ define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gpt
define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: s_mov_b32 m0, s5
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds
; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
; GFX942: ; %bb.0:
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: s_mov_b32 m0, s5
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_add_u32_e32 v0, s4, v0
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds
; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
; GFX942-NEXT: s_endpgm
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,14 @@ define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr add
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT: s_add_u32 s4, s0, 64
; CHECK-NEXT: s_addc_u32 s5, s1, 0
; CHECK-NEXT: s_mov_b32 m0, s2
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_load_lds_dword v0, s[4:5] offset:4
; CHECK-NEXT: s_load_dword s4, s[0:1], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s6
; CHECK-NEXT: global_store_dword v0, v3, s[0:1] offset:64
; CHECK-NEXT: global_load_lds_dword v0, s[4:5] offset:4
; CHECK-NEXT: ; sched_barrier mask(0x00000000)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: v_mov_b32_e32 v2, s3
Expand Down
Loading