diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 594c35a1dee3b..0141c365789ca 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2369,24 +2369,30 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); } -// Maintain the correct LDS address for async loads. -// It becomes incorrect when promoteConstantOffsetToImm -// adds an offset only meant for the src operand. +// Maintain the correct LDS address for async loads and stores. +// It becomes incorrect when promoteConstantOffsetToImm adds an offset only +// meant for the global address operand. For async loads the LDS address is in +// vdst. For async stores, the LDS address is in vdata. void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const { if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0) return; - Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); - Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst)); + MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (!LDSAddr) + LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); + assert(LDSAddr); + + Register OldReg = LDSAddr->getReg(); + Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg)); MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewVDst) - .addReg(OldVDst) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg) + .addReg(OldReg) .addImm(-OffsetDiff) .addImm(0); - MI.getOperand(0).setReg(NewVDst); + LDSAddr->setReg(NewReg); } bool SILoadStoreOptimizer::promoteConstantOffsetToImm( diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll index f557318cffb1f..b5b0164969212 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll @@ -95,3 +95,51 @@ entry: ret void } + +; Same as promote_async_load_offset_negative above, but for async stores. The +; LDS address is in vdata instead of vdst, so this tests that +; updateAsyncLDSAddress corrects the right operand. +define amdgpu_kernel void @promote_async_store_offset_negative(ptr addrspace(1) %dst) { +; GFX1250-LABEL: promote_async_store_offset_negative: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 0x100, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_async_from_lds_b128 v0, v1, s[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] +; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff00 +; GFX1250-NEXT: v_add_nc_u32_e64 v0, 0xfffffe00, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[2:3] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v0, off offset:512 +; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v1, off +; GFX1250-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.offset = shl i32 %tid, 0 + %lds.gep = getelementptr i8, ptr addrspace(3) @lds, i32 0 + + ; First store at base + 256 + %offset0 = add i32 256, %gep.offset + %zext0 = zext i32 %offset0 to i64 + %gep0 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext0 + call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep0, ptr addrspace(3) %lds.gep, i32 0, i32 0) + + ; Second store at base + 512 (+512 from 0) + %offset1 = add i32 512, %gep.offset + %zext1 = zext i32 %offset1 to i64 + %gep1 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext1 + call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep1, ptr addrspace(3) %lds.gep, i32 0, i32 0) + + ; Final store at base + 0 + %offset2 = add i32 0, %gep.offset + %zext2 = zext i32 %offset2 to i64 + %gep2 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext2 + call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep2, ptr addrspace(3) %lds.gep, i32 0, i32 0) + + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir index 8d8c791fbd9ae..dab161ba080e5 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir @@ -109,3 +109,39 @@ body: | GLOBAL_LOAD_ASYNC_TO_LDS_B128 %6, killed %15, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) S_ENDPGM 0 ... + +# Same as promote_async_load_offset above, but for async stores. The LDS address +# is in vdata instead of vdst, so this tests that updateAsyncLDSAddress corrects +# the right operand. +--- +name: promote_async_store_offset +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $vgpr0, $sgpr0_sgpr1, $ttmp7 + ; GFX1250-LABEL: name: promote_async_store_offset + ; GFX1250: liveins: $ttmp7, $vgpr0, $sgpr0_sgpr1 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 8, $vgpr0, implicit $exec + ; GFX1250-NEXT: renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 $vgpr0, 512, 0, implicit $exec + ; GFX1250-NEXT: renamable $vgpr3, dead $sgpr_null = V_ADDC_U32_e64 0, killed $vgpr0, killed $vcc_lo, 0, implicit $exec + ; GFX1250-NEXT: renamable $vgpr1 = disjoint V_OR_B32_e32 0, killed $vgpr1, implicit $exec + ; GFX1250-NEXT: renamable $vgpr0 = V_ADD_U32_e32 256, $vgpr1, implicit $exec + ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 $vgpr2_vgpr3, killed $vgpr0, -256, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) + ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed $vgpr2_vgpr3, killed $vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_LSHLREV_B32_e64 8, %0, implicit $exec + %2:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec + %3:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec + %4:vgpr_32, %5:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 256, 0, implicit $exec + %6:vgpr_32, %7:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %5, 0, implicit $exec + %8:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %6, %subreg.sub1 + GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %8, killed %2, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) + %9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 512, 0, implicit $exec + %11:vgpr_32, %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %10, 0, implicit $exec + %13:vreg_64_align2 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 + GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %13, killed %3, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3) +...