-
Notifications
You must be signed in to change notification settings - Fork 17.7k
[AMDGPU] Fix LDS address correction in promoteConstantOffsetToImm for async stores #180220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
cd1cb0c
e91f6ad
6882d9b
9964052
69cd857
dc37da6
a0c48c1
047d2fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2369,24 +2369,31 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base | |
| Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); | ||
| } | ||
|
|
||
| // Maintain the correct LDS address for async loads. | ||
| // It becomes incorrect when promoteConstantOffsetToImm | ||
| // adds an offset only meant for the src operand. | ||
| // Maintain the correct LDS address for async loads and stores. | ||
| // It becomes incorrect when promoteConstantOffsetToImm adds an offset only | ||
| // meant for the global address operand. For async loads the LDS address is in | ||
| // vdst. For async stores, the LDS address is in vdata. | ||
| void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI, | ||
| int32_t OffsetDiff) const { | ||
| if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0) | ||
| return; | ||
|
|
||
| Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); | ||
| Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst)); | ||
| MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | ||
| if (!LDSAddr) | ||
| LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); | ||
| if (!LDSAddr) | ||
| return; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this just be an assert? Unless there's another case I'm not aware of?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. LGTM otherwise. |
||
|
|
||
| Register OldReg = LDSAddr->getReg(); | ||
| Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg)); | ||
| MachineBasicBlock &MBB = *MI.getParent(); | ||
| const DebugLoc &DL = MI.getDebugLoc(); | ||
| BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewVDst) | ||
| .addReg(OldVDst) | ||
| BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg) | ||
| .addReg(OldReg) | ||
| .addImm(-OffsetDiff) | ||
| .addImm(0); | ||
|
|
||
| MI.getOperand(0).setReg(NewVDst); | ||
| LDSAddr->setReg(NewReg); | ||
| } | ||
|
|
||
| bool SILoadStoreOptimizer::promoteConstantOffsetToImm( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -109,3 +109,39 @@ body: | | |
| GLOBAL_LOAD_ASYNC_TO_LDS_B128 %6, killed %15, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) | ||
| S_ENDPGM 0 | ||
| ... | ||
|
|
||
| # Same as promote_async_load_offset above, but for async stores. The LDS address | ||
| # is in vdata instead of vdst, so this tests that updateAsyncLDSAddress corrects | ||
| # the right operand. | ||
| --- | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you call out the line with the corrected offset and why it has this value in a comment?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for the review, can you have another look? Both tests are essentially the same as the existing ones for async_load, the intent is just to verify that we adjust the correct operands and we no longer crash. |
||
| name: promote_async_store_offset | ||
| machineFunctionInfo: | ||
| stackPtrOffsetReg: '$sgpr32' | ||
| frameOffsetReg: '$sgpr33' | ||
| tracksRegLiveness: true | ||
| body: | | ||
| bb.0.entry: | ||
| liveins: $vgpr0, $sgpr0_sgpr1, $ttmp7 | ||
| ; GFX1250-LABEL: name: promote_async_store_offset | ||
| ; GFX1250: liveins: $ttmp7, $vgpr0, $sgpr0_sgpr1 | ||
| ; GFX1250-NEXT: {{ $}} | ||
| ; GFX1250-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 8, $vgpr0, implicit $exec | ||
| ; GFX1250-NEXT: renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 $vgpr0, 512, 0, implicit $exec | ||
| ; GFX1250-NEXT: renamable $vgpr3, dead $sgpr_null = V_ADDC_U32_e64 0, killed $vgpr0, killed $vcc_lo, 0, implicit $exec | ||
| ; GFX1250-NEXT: renamable $vgpr1 = disjoint V_OR_B32_e32 0, killed $vgpr1, implicit $exec | ||
| ; GFX1250-NEXT: renamable $vgpr0 = V_ADD_U32_e32 256, $vgpr1, implicit $exec | ||
| ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 $vgpr2_vgpr3, killed $vgpr0, -256, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) | ||
| ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed $vgpr2_vgpr3, killed $vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) | ||
| %0:vgpr_32 = COPY $vgpr0 | ||
| %1:vgpr_32 = V_LSHLREV_B32_e64 8, %0, implicit $exec | ||
| %2:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec | ||
| %3:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec | ||
| %4:vgpr_32, %5:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 256, 0, implicit $exec | ||
| %6:vgpr_32, %7:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %5, 0, implicit $exec | ||
| %8:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %6, %subreg.sub1 | ||
| GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %8, killed %2, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) | ||
| %9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 512, 0, implicit $exec | ||
| %11:vgpr_32, %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %10, 0, implicit $exec | ||
| %13:vreg_64_align2 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 | ||
| GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %13, killed %3, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) | ||
| ... | ||
Uh oh!
There was an error while loading. Please reload this page.