Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2369,24 +2369,31 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

// Maintain the correct LDS address for async loads.
// It becomes incorrect when promoteConstantOffsetToImm
// adds an offset only meant for the src operand.
// Maintain the correct LDS address for async loads and stores.
// It becomes incorrect when promoteConstantOffsetToImm adds an offset only
// meant for the global address operand. For async loads the LDS address is in
// vdst. For async stores, the LDS address is in vdata.
void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
int32_t OffsetDiff) const {
if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
return;

Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst));
MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (!LDSAddr)
LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
Comment thread
AlexAUT marked this conversation as resolved.
if (!LDSAddr)
return;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this just be an assert? Unless there's another case I'm not aware of?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM otherwise.


Register OldReg = LDSAddr->getReg();
Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg));
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewVDst)
.addReg(OldVDst)
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg)
.addReg(OldReg)
.addImm(-OffsetDiff)
.addImm(0);

MI.getOperand(0).setReg(NewVDst);
LDSAddr->setReg(NewReg);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
Expand Down
48 changes: 48 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,51 @@ entry:

ret void
}

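; Same as promote_async_load_offset_negative above, but for async stores. The
; LDS address is in vdata instead of vdst, so this tests that
; updateAsyncLDSAddress corrects the right operand. The store that is rewritten
; to use offset:512 must read its LDS address from a register holding the
; original address minus 512 (the v_add_nc_u32 with 0xfffffe00 in the checks),
; which cancels the immediate offset that was only meant for the global address.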
define amdgpu_kernel void @promote_async_store_offset_negative(ptr addrspace(1) %dst) {
; GFX1250-LABEL: promote_async_store_offset_negative:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 0x100, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_async_from_lds_b128 v0, v1, s[0:1]
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff00
; GFX1250-NEXT: v_add_nc_u32_e64 v0, 0xfffffe00, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[2:3]
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v0, off offset:512
; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v1, off
; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.offset = shl i32 %tid, 0
%lds.gep = getelementptr i8, ptr addrspace(3) @lds, i32 0

; First store at base + 256
%offset0 = add i32 256, %gep.offset
%zext0 = zext i32 %offset0 to i64
%gep0 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext0
call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep0, ptr addrspace(3) %lds.gep, i32 0, i32 0)

; Second store at base + 512 (+512 from 0)
%offset1 = add i32 512, %gep.offset
%zext1 = zext i32 %offset1 to i64
%gep1 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext1
call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep1, ptr addrspace(3) %lds.gep, i32 0, i32 0)

; Final store at base + 0
%offset2 = add i32 0, %gep.offset
%zext2 = zext i32 %offset2 to i64
%gep2 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext2
call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep2, ptr addrspace(3) %lds.gep, i32 0, i32 0)

ret void
}
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,39 @@ body: |
GLOBAL_LOAD_ASYNC_TO_LDS_B128 %6, killed %15, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
S_ENDPGM 0
...

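# Same as promote_async_load_offset above, but for async stores. The LDS address
# is in vdata instead of vdst, so this tests that updateAsyncLDSAddress corrects
# the right operand. The first store is rebased onto the second store's address
# with an immediate offset of -256, so its LDS address (vdata) must be
# compensated by the V_ADD_U32 of 256 that feeds it.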
---

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you call out the line with the corrected offset and why it has this value in a comment?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the review; could you have another look? Both tests are essentially the same as the existing ones for async_load; the intent is just to verify that we adjust the correct operands and no longer crash.

name: promote_async_store_offset
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr33'
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $vgpr0, $sgpr0_sgpr1, $ttmp7
; GFX1250-LABEL: name: promote_async_store_offset
; GFX1250: liveins: $ttmp7, $vgpr0, $sgpr0_sgpr1
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 8, $vgpr0, implicit $exec
; GFX1250-NEXT: renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 $vgpr0, 512, 0, implicit $exec
; GFX1250-NEXT: renamable $vgpr3, dead $sgpr_null = V_ADDC_U32_e64 0, killed $vgpr0, killed $vcc_lo, 0, implicit $exec
; GFX1250-NEXT: renamable $vgpr1 = disjoint V_OR_B32_e32 0, killed $vgpr1, implicit $exec
; GFX1250-NEXT: renamable $vgpr0 = V_ADD_U32_e32 256, $vgpr1, implicit $exec
; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 $vgpr2_vgpr3, killed $vgpr0, -256, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed $vgpr2_vgpr3, killed $vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_LSHLREV_B32_e64 8, %0, implicit $exec
%2:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
%3:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
%4:vgpr_32, %5:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 256, 0, implicit $exec
%6:vgpr_32, %7:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %5, 0, implicit $exec
%8:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %6, %subreg.sub1
GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %8, killed %2, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
%9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 512, 0, implicit $exec
%11:vgpr_32, %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %10, 0, implicit $exec
%13:vreg_64_align2 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %13, killed %3, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
...