[AMDGPU] Fix LDS address correction in promoteConstantOffsetToImm for async stores#180220
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Alexander Weinrauch (AlexAUT) Changes
Full diff: https://github.com/llvm/llvm-project/pull/180220.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 594c35a1dee3b..2d7870878aa11 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2369,24 +2369,32 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
-// Maintain the correct LDS address for async loads.
-// It becomes incorrect when promoteConstantOffsetToImm
-// adds an offset only meant for the src operand.
+// Maintain the correct LDS address for async loads and stores.
+// It becomes incorrect when promoteConstantOffsetToImm adds an offset only
+// meant for the global address operand. For async loads the LDS address is in
+// vdst. For async stores, the LDS address is in vdata.
void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
int32_t OffsetDiff) const {
if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
return;
- Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
- Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst));
+ MachineOperand *LDSAddr =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (!LDSAddr)
+ LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ if (!LDSAddr)
+ return;
+
+ Register OldReg = LDSAddr->getReg();
+ Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg));
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewVDst)
- .addReg(OldVDst)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg)
+ .addReg(OldReg)
.addImm(-OffsetDiff)
.addImm(0);
- MI.getOperand(0).setReg(NewVDst);
+ LDSAddr->setReg(NewReg);
}
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
index f557318cffb1f..318b5b5c6190b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
@@ -95,3 +95,49 @@ entry:
ret void
}
+
+; Test async_store_from_lds
+define amdgpu_kernel void @promote_async_store_offset_negative(ptr addrspace(1) %dst) {
+; GFX1250-LABEL: promote_async_store_offset_negative:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 0x100, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_async_from_lds_b128 v0, v1, s[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
+; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff00
+; GFX1250-NEXT: v_add_nc_u32_e64 v0, 0xfffffe00, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[2:3]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v0, off offset:512
+; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v1, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.offset = shl i32 %tid, 0
+ %lds.gep = getelementptr i8, ptr addrspace(3) @lds, i32 0
+
+ ; First store at base + 256
+ %offset0 = add i32 256, %gep.offset
+ %zext0 = zext i32 %offset0 to i64
+ %gep0 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext0
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep0, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+ ; Second store at base + 512 (+512 from 0)
+ %offset1 = add i32 512, %gep.offset
+ %zext1 = zext i32 %offset1 to i64
+ %gep1 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext1
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep1, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+ ; Final store at base + 0
+ %offset2 = add i32 0, %gep.offset
+ %zext2 = zext i32 %offset2 to i64
+ %gep2 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext2
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep2, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
index 8d8c791fbd9ae..2baf0f1362819 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
@@ -109,3 +109,36 @@ body: |
GLOBAL_LOAD_ASYNC_TO_LDS_B128 %6, killed %15, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
S_ENDPGM 0
...
+
+---
+name: promote_async_store_offset
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1, $ttmp7
+ ; GFX1250-LABEL: name: promote_async_store_offset
+ ; GFX1250: liveins: $ttmp7, $vgpr0, $sgpr0_sgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 8, $vgpr0, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 $vgpr0, 512, 0, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr3, dead $sgpr_null = V_ADDC_U32_e64 0, killed $vgpr0, killed $vcc_lo, 0, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr1 = disjoint V_OR_B32_e32 0, killed $vgpr1, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr0 = V_ADD_U32_e32 256, $vgpr1, implicit $exec
+ ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 $vgpr2_vgpr3, killed $vgpr0, -256, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+ ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed $vgpr2_vgpr3, killed $vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_LSHLREV_B32_e64 8, %0, implicit $exec
+ %2:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
+ %3:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
+ %4:vgpr_32, %5:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 256, 0, implicit $exec
+ %6:vgpr_32, %7:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %5, 0, implicit $exec
+ %8:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %6, %subreg.sub1
+ GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %8, killed %2, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+ %9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 512, 0, implicit $exec
+ %11:vgpr_32, %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %10, 0, implicit $exec
+ %13:vreg_64_align2 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
+ GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %13, killed %3, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+...
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
| S_ENDPGM 0 | ||
| ... | ||
|
|
||
| --- |
There was a problem hiding this comment.
Could you call out the line with the corrected offset and why it has this value in a comment?
There was a problem hiding this comment.
Thank you for the review, can you have another look? Both tests are essentially the same as the existing ones for async_load, the intent is just to verify that we adjust the correct operands and we no longer crash.
| ret void | ||
| } | ||
|
|
||
| ; Test async_store_from_lds |
There was a problem hiding this comment.
Could you call out what is being tested exactly and what should be observed?
| if (!LDSAddr) | ||
| return; |
There was a problem hiding this comment.
Shouldn't this just be an assert? Unless there's another case I'm not aware of?
Co-authored-by: Jay Foad <jay.foad@gmail.com>
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083 (cherry picked from commit 552306f)
LLVM bump cherry pick for rel/3.7, this improve stability across ROCm archs. This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083 (cherry picked from commit 552306f) <!--- The core Triton is a small number of people, and we receive many PRs (thank you!). To help us review your code more quickly, **if you are a new contributor (less than 3 PRs merged) we ask that you complete the following tasks and include the filled-out checklist in your PR description.** Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them. --> # New contributor declaration - [ ] I am not making a trivial change, such as fixing a typo in a comment. - [ ] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [ ] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [ ] This PR does not need a test because `FILL THIS IN`. - Select one of the following. - [ ] I have not added any `lit` tests. - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.) Co-authored-by: Alexander Weinrauch <alexander.weinrauch@amd.com>
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083
updateAsyncLDSAddress, introduces by #176816, previously only handled async loads , where the LDS address is in thevdstoperand. Therefore Async stores produced a nullptr dereference since the LDS address is invdatafor those instructions.