-
Notifications
You must be signed in to change notification settings - Fork 17.3k
[AMDGPU] Return two MMOs for load-to-lds and store-from-lds intrinsics #175845
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1452,8 +1452,25 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos, | |
| case Intrinsic::amdgcn_struct_buffer_load_lds: | ||
| case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { | ||
| unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); | ||
|
|
||
| // Entry 0: Load from buffer. | ||
| // Don't set an offset, since the pointer value always represents the | ||
| // base of the buffer. | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); | ||
| Info.ptrVal = CI.getArgOperand(1); | ||
| Info.flags &= ~MachineMemOperand::MOStore; | ||
| Infos.push_back(Info); | ||
|
|
||
| // Entry 1: Store to LDS. | ||
| // Instruction offset is applied, and an additional per-lane offset | ||
| // which we simulate using a larger memory type. | ||
| Info.memVT = EVT::getIntegerVT( | ||
| CI.getContext(), Width * 8 * Subtarget->getWavefrontSize()); | ||
| Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer | ||
| Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2)) | ||
| ->getZExtValue(); | ||
| Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS; | ||
| Info.flags &= ~MachineMemOperand::MOLoad; | ||
| Info.flags |= MachineMemOperand::MOStore; | ||
| Infos.push_back(Info); | ||
| return; | ||
| } | ||
|
|
@@ -1646,35 +1663,65 @@ void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos, | |
| case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: | ||
| case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: | ||
| case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { | ||
| // Entry 0: Load from source (global/flat). | ||
| Info.opc = ISD::INTRINSIC_VOID; | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); | ||
| Info.ptrVal = CI.getArgOperand(1); | ||
| Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; | ||
| Info.ptrVal = CI.getArgOperand(0); // Global pointer | ||
| Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue(); | ||
| Info.flags |= MachineMemOperand::MOLoad; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Info.flags &= ~MachineMemOperand::MOStore; or is it unnecessary?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The initial Info.flags doesn't set either Load or Store. |
||
| Infos.push_back(Info); | ||
|
|
||
| // Entry 1: Store to LDS (same offset). | ||
| Info.flags &= ~MachineMemOperand::MOLoad; | ||
| Info.flags |= MachineMemOperand::MOStore; | ||
| Info.ptrVal = CI.getArgOperand(1); // LDS pointer | ||
| Infos.push_back(Info); | ||
| return; | ||
| } | ||
| case Intrinsic::amdgcn_global_store_async_from_lds_b8: | ||
| case Intrinsic::amdgcn_global_store_async_from_lds_b32: | ||
| case Intrinsic::amdgcn_global_store_async_from_lds_b64: | ||
| case Intrinsic::amdgcn_global_store_async_from_lds_b128: { | ||
| // Entry 0: Load from LDS. | ||
| Info.opc = ISD::INTRINSIC_VOID; | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); | ||
| Info.ptrVal = CI.getArgOperand(0); | ||
| Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; | ||
| Info.ptrVal = CI.getArgOperand(1); // LDS pointer | ||
| Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue(); | ||
| Info.flags |= MachineMemOperand::MOLoad; | ||
| Infos.push_back(Info); | ||
|
|
||
| // Entry 1: Store to global (same offset). | ||
| Info.flags &= ~MachineMemOperand::MOLoad; | ||
| Info.flags |= MachineMemOperand::MOStore; | ||
| Info.ptrVal = CI.getArgOperand(0); // Global pointer | ||
| Infos.push_back(Info); | ||
| return; | ||
| } | ||
| case Intrinsic::amdgcn_load_to_lds: | ||
| case Intrinsic::amdgcn_global_load_lds: { | ||
| Info.opc = ISD::INTRINSIC_VOID; | ||
| unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); | ||
| Info.ptrVal = CI.getArgOperand(1); | ||
| Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; | ||
| auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1)); | ||
| if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE) | ||
| bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE; | ||
|
|
||
| // Entry 0: Load from source (global/flat). | ||
| Info.opc = ISD::INTRINSIC_VOID; | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); | ||
| Info.ptrVal = CI.getArgOperand(0); // Source pointer | ||
| Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue(); | ||
| Info.flags |= MachineMemOperand::MOLoad; | ||
| if (IsVolatile) | ||
| Info.flags |= MachineMemOperand::MOVolatile; | ||
| Infos.push_back(Info); | ||
|
|
||
| // Entry 1: Store to LDS. | ||
| // Same offset from the instruction, but an additional per-lane offset is | ||
| // added. Represent that using a wider memory type. | ||
| Info.memVT = EVT::getIntegerVT(CI.getContext(), | ||
| Width * 8 * Subtarget->getWavefrontSize()); | ||
| Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer | ||
| Info.flags &= ~MachineMemOperand::MOLoad; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above, just assign the correct values instead of trying to read whatever happened to be the default
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same note as above. |
||
| Info.flags |= MachineMemOperand::MOStore; | ||
| Infos.push_back(Info); | ||
| return; | ||
| } | ||
| case Intrinsic::amdgcn_ds_bvh_stack_rtn: | ||
|
|
@@ -11345,7 +11392,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, | |
| SDLoc DL(Op); | ||
| SDValue Chain = Op.getOperand(0); | ||
| unsigned IntrinsicID = Op.getConstantOperandVal(1); | ||
| MachineFunction &MF = DAG.getMachineFunction(); | ||
|
|
||
| switch (IntrinsicID) { | ||
| case Intrinsic::amdgcn_exp_compr: { | ||
|
|
@@ -11620,29 +11666,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, | |
| Ops.push_back(M0Val.getValue(1)); // Glue | ||
|
|
||
| auto *M = cast<MemSDNode>(Op); | ||
| MachineMemOperand *LoadMMO = M->getMemOperand(); | ||
| // Don't set the offset value here because the pointer points to the base of | ||
| // the buffer. | ||
| MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); | ||
|
|
||
| MachinePointerInfo StorePtrI = LoadPtrI; | ||
| LoadPtrI.V = PoisonValue::get( | ||
| PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE)); | ||
| LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE; | ||
| StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; | ||
|
|
||
| auto F = LoadMMO->getFlags() & | ||
| ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); | ||
| LoadMMO = | ||
| MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, | ||
| LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); | ||
|
|
||
| MachineMemOperand *StoreMMO = MF.getMachineMemOperand( | ||
| StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), | ||
| LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); | ||
|
|
||
| auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); | ||
| DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); | ||
| DAG.setNodeMemRefs(Load, M->memoperands()); | ||
|
|
||
| return SDValue(Load, 0); | ||
| } | ||
|
|
@@ -11724,25 +11749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, | |
| Ops.push_back(M0Val.getValue(1)); // Glue | ||
|
|
||
| auto *M = cast<MemSDNode>(Op); | ||
| MachineMemOperand *LoadMMO = M->getMemOperand(); | ||
| MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); | ||
| LoadPtrI.Offset = Op->getConstantOperandVal(5); | ||
| MachinePointerInfo StorePtrI = LoadPtrI; | ||
| LoadPtrI.V = PoisonValue::get( | ||
| PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); | ||
| LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; | ||
| StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Huzzah! It's finally gone! |
||
| auto F = LoadMMO->getFlags() & | ||
| ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); | ||
| LoadMMO = | ||
| MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, | ||
| LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); | ||
| MachineMemOperand *StoreMMO = MF.getMachineMemOperand( | ||
| StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), | ||
| LoadMMO->getAAInfo()); | ||
|
|
||
| auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); | ||
| DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); | ||
| DAG.setNodeMemRefs(Load, M->memoperands()); | ||
|
|
||
| return SDValue(Load, 0); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -267,9 +267,8 @@ main_body: | |
| define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { | ||
| ; GFX90A-LABEL: buffer_load_lds_dword_volatile: | ||
| ; GFX90A: ; %bb.0: ; %main_body | ||
| ; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 | ||
| ; GFX90A-NEXT: s_mov_b32 m0, s5 | ||
| ; GFX90A-NEXT: s_nop 0 | ||
| ; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 | ||
| ; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds | ||
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) | ||
| ; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds | ||
|
|
@@ -278,9 +277,8 @@ define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture | |
| ; | ||
| ; GFX942-LABEL: buffer_load_lds_dword_volatile: | ||
| ; GFX942: ; %bb.0: ; %main_body | ||
| ; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 | ||
| ; GFX942-NEXT: s_mov_b32 m0, s5 | ||
| ; GFX942-NEXT: s_nop 0 | ||
| ; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a nice side-effect of this |
||
| ; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds | ||
| ; GFX942-NEXT: s_waitcnt vmcnt(0) | ||
| ; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Info.flags |= MachineMemOperand::MOLoad; ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The initial Info.flags sets both Load and Store.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just assign store then? Assume both is a conservative default, but I don't expect targets to be trying to read information from the default
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are other flags in play here, including ones based on metadata that are setup in the beginning.
I was trying to keep the change minimal, but I agree that the code is confusing as-is. That said, looking into cleaning it up has uncovered a whole bunch of weirdness and inconsistencies, so I'm going to follow up with a separate change to refactor this.