[AMDGPU] Fix LDS address correction in promoteConstantOffsetToImm for async stores by AlexAUT · Pull Request #180220 · llvm/llvm-project

AlexAUT · 2026-02-06T15:58:31Z

updateAsyncLDSAddress, introduces by #176816, previously only handled async loads , where the LDS address is in the vdst operand. Therefore Async stores produced a nullptr dereference since the LDS address is in vdata for those instructions.

llvmbot · 2026-02-06T15:59:07Z

@llvm/pr-subscribers-backend-amdgpu

Author: Alexander Weinrauch (AlexAUT)

Changes

updateAsyncLDSAddress, introduces by #176816, previously only handled async loads , where the LDS address is in the vdst operand. Therefore Async stores produced a nullptr dereference since the LDS address is in vdata for those instructions.

Full diff: https://github.com/llvm/llvm-project/pull/180220.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+16-8)
(modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll (+46)
(modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir (+33)

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 594c35a1dee3b..2d7870878aa11 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2369,24 +2369,32 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
 }
 
-// Maintain the correct LDS address for async loads.
-// It becomes incorrect when promoteConstantOffsetToImm
-// adds an offset only meant for the src operand.
+// Maintain the correct LDS address for async loads and stores.
+// It becomes incorrect when promoteConstantOffsetToImm adds an offset only
+// meant for the global address operand. For async loads the LDS address is in
+// vdst. For async stores, the LDS address is in vdata.
 void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
                                                  int32_t OffsetDiff) const {
   if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
     return;
 
-  Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
-  Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst));
+  MachineOperand *LDSAddr =
+      TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (!LDSAddr)
+    LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+  if (!LDSAddr)
+    return;
+
+  Register OldReg = LDSAddr->getReg();
+  Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg));
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
-  BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewVDst)
-      .addReg(OldVDst)
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg)
+      .addReg(OldReg)
       .addImm(-OffsetDiff)
       .addImm(0);
 
-  MI.getOperand(0).setReg(NewVDst);
+  LDSAddr->setReg(NewReg);
 }
 
 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
index f557318cffb1f..318b5b5c6190b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
@@ -95,3 +95,49 @@ entry:
 
   ret void
 }
+
+; Test async_store_from_lds
+define amdgpu_kernel void @promote_async_store_offset_negative(ptr addrspace(1) %dst) {
+; GFX1250-LABEL: promote_async_store_offset_negative:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 0x100, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_async_from_lds_b128 v0, v1, s[0:1]
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
+; GFX1250-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff00
+; GFX1250-NEXT:    v_add_nc_u32_e64 v0, 0xfffffe00, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[2:3]
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_async_from_lds_b128 v[2:3], v0, off offset:512
+; GFX1250-NEXT:    global_store_async_from_lds_b128 v[2:3], v1, off
+; GFX1250-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.offset = shl i32 %tid, 0
+  %lds.gep = getelementptr i8, ptr addrspace(3) @lds, i32 0
+
+  ; First store at base + 256
+  %offset0 = add i32 256, %gep.offset
+  %zext0 = zext i32 %offset0 to i64
+  %gep0 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext0
+  call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep0, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+  ; Second store at base + 512 (+512 from 0)
+  %offset1 = add i32 512, %gep.offset
+  %zext1 = zext i32 %offset1 to i64
+  %gep1 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext1
+  call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep1, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+  ; Final store at base + 0
+  %offset2 = add i32 0, %gep.offset
+  %zext2 = zext i32 %offset2 to i64
+  %gep2 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext2
+  call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep2, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
index 8d8c791fbd9ae..2baf0f1362819 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
@@ -109,3 +109,36 @@ body:             |
     GLOBAL_LOAD_ASYNC_TO_LDS_B128 %6, killed %15, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
     S_ENDPGM 0
 ...
+
+---
+name: promote_async_store_offset
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr0_sgpr1, $ttmp7
+    ; GFX1250-LABEL: name: promote_async_store_offset
+    ; GFX1250: liveins: $ttmp7, $vgpr0, $sgpr0_sgpr1
+    ; GFX1250-NEXT: {{  $}}
+    ; GFX1250-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 8, $vgpr0, implicit $exec
+    ; GFX1250-NEXT: renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 $vgpr0, 512, 0, implicit $exec
+    ; GFX1250-NEXT: renamable $vgpr3, dead $sgpr_null = V_ADDC_U32_e64 0, killed $vgpr0, killed $vcc_lo, 0, implicit $exec
+    ; GFX1250-NEXT: renamable $vgpr1 = disjoint V_OR_B32_e32 0, killed $vgpr1, implicit $exec
+    ; GFX1250-NEXT: renamable $vgpr0 = V_ADD_U32_e32 256, $vgpr1, implicit $exec
+    ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 $vgpr2_vgpr3, killed $vgpr0, -256, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+    ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed $vgpr2_vgpr3, killed $vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_LSHLREV_B32_e64 8, %0, implicit $exec
+    %2:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
+    %3:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
+    %4:vgpr_32, %5:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 256, 0, implicit $exec
+    %6:vgpr_32, %7:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %5, 0, implicit $exec
+    %8:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %6, %subreg.sub1
+    GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %8, killed %2, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+    %9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 512, 0, implicit $exec
+    %11:vgpr_32, %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %10, 0, implicit $exec
+    %13:vreg_64_align2 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
+    GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %13, killed %3, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+...

github-actions · 2026-02-06T16:00:17Z

✅ With the latest revision this PR passed the C/C++ code formatter.

qcolombet

LGTM with nits

qcolombet · 2026-02-06T17:02:33Z

    S_ENDPGM 0
 ...
+
+---


Could you call out the line with the corrected offset and why it has this value in a comment?

Thank you for the review, can you have another look? Both tests are essentially the same as the existing ones for async_load, the intent is just to verify that we adjust the correct operands and we no longer crash.

qcolombet · 2026-02-06T17:03:14Z

  ret void
 }
+
+; Test async_store_from_lds


Could you call out what is being tested exactly and what should be observed?

jayfoad · 2026-02-09T12:17:43Z

updateAsyncLDSAddress, introduces by #176816, previously only handled async loads , where the LDS address is in the vdst operand. Therefore Async stores produced a nullptr dereference since the LDS address is in vdata for those instructions.

@RyanRio any comments?

RyanRio · 2026-02-10T01:27:08Z

+  if (!LDSAddr)
+    return;


Shouldn't this just be an assert? Unless there's another case I'm not aware of?

LGTM otherwise.

Co-authored-by: Jay Foad <jay.foad@gmail.com>

github-actions · 2026-02-10T20:01:11Z

🐧 Linux x64 Test Results

189913 tests passed
5064 tests skipped

✅ The build succeeded and all tests passed.

github-actions · 2026-02-10T20:01:11Z

🪟 Windows x64 Test Results

130745 tests passed
2926 tests skipped

✅ The build succeeded and all tests passed.

This reverts commit dc37da6.

This reverts commit 69cd857.

This pulls in: - llvm/llvm-project#180220 - llvm/llvm-project#180268

This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083

This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083 (cherry picked from commit 552306f)

LLVM bump cherry pick for rel/3.7, this improve stability across ROCm archs. This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083 (cherry picked from commit 552306f)  # New contributor declaration - [ ] I am not making a trivial change, such as fixing a typo in a comment. - [ ] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [ ] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [ ] This PR does not need a test because `FILL THIS IN`. - Select one of the following. - [ ] I have not added any `lit` tests. - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.) Co-authored-by: Alexander Weinrauch <alexander.weinrauch@amd.com>

This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083

Support async_store_from_lds

cd1cb0c

llvmbot added the backend:AMDGPU label Feb 6, 2026

Fix format

e91f6ad

qcolombet approved these changes Feb 6, 2026

View reviewed changes

Add comments to tests

6882d9b

jayfoad requested a review from arsenm February 9, 2026 12:17

RyanRio reviewed Feb 10, 2026

View reviewed changes

Assert LDS addr

9964052

jayfoad reviewed Feb 10, 2026

View reviewed changes

Comment thread llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

AlexAUT and others added 2 commits February 10, 2026 19:19

Style adjustment

69cd857

Co-authored-by: Jay Foad <jay.foad@gmail.com>

Fix format

dc37da6

AlexAUT added 2 commits February 10, 2026 20:25

Revert "Fix format"

a0c48c1

This reverts commit dc37da6.

Revert "Style adjustment"

047d2fa

This reverts commit 69cd857.

antiagainst merged commit 979132a into llvm:main Feb 10, 2026
10 checks passed

AlexAUT mentioned this pull request Feb 10, 2026

[LLVM-HEAD] Bump LLVM to 979132a triton-lang/triton#9426

Merged

antiagainst pushed a commit to triton-lang/triton that referenced this pull request Feb 10, 2026

[LLVM-HEAD] Bump LLVM to 979132a (#9426)

53ca298

This pulls in: - llvm/llvm-project#180220 - llvm/llvm-project#180268

AlexAUT mentioned this pull request Feb 11, 2026

[Backend] Bump to llvm/llvm-project@979132a triton-lang/triton#9431

Merged

jataylo mentioned this pull request Feb 16, 2026

[Backend] Bump to llvm/llvm-project@979132a (#9431) triton-lang/triton#9477

Merged

7 tasks

guacamoleo pushed a commit to ROCm/triton that referenced this pull request Apr 16, 2026

[Backend] Bump to llvm/llvm-project@979132a

3f5730c

This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Fix LDS address correction in promoteConstantOffsetToImm for async stores#180220

[AMDGPU] Fix LDS address correction in promoteConstantOffsetToImm for async stores#180220
antiagainst merged 8 commits into
llvm:mainfrom
AlexAUT:ldsOperandOffsetFixAsyncStore

AlexAUT commented Feb 6, 2026

Uh oh!

llvmbot commented Feb 6, 2026

Uh oh!

github-actions Bot commented Feb 6, 2026 •

edited

Loading

Uh oh!

qcolombet left a comment

Uh oh!

qcolombet Feb 6, 2026

Uh oh!

AlexAUT Feb 6, 2026

Uh oh!

qcolombet Feb 6, 2026

Uh oh!

jayfoad commented Feb 9, 2026

Uh oh!

RyanRio Feb 10, 2026

Uh oh!

RyanRio Feb 10, 2026

Uh oh!

Uh oh!

github-actions Bot commented Feb 10, 2026 •

edited

Loading

Uh oh!

github-actions Bot commented Feb 10, 2026 •

edited

Loading

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

Conversation

AlexAUT commented Feb 6, 2026

Uh oh!

llvmbot commented Feb 6, 2026

Uh oh!

github-actions Bot commented Feb 6, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

qcolombet left a comment

Choose a reason for hiding this comment

Uh oh!

qcolombet Feb 6, 2026

Choose a reason for hiding this comment

Uh oh!

AlexAUT Feb 6, 2026

Choose a reason for hiding this comment

Uh oh!

qcolombet Feb 6, 2026

Choose a reason for hiding this comment

Uh oh!

jayfoad commented Feb 9, 2026

Uh oh!

RyanRio Feb 10, 2026

Choose a reason for hiding this comment

Uh oh!

RyanRio Feb 10, 2026

Choose a reason for hiding this comment

Uh oh!

Uh oh!

github-actions Bot commented Feb 10, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🐧 Linux x64 Test Results

Uh oh!

github-actions Bot commented Feb 10, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🪟 Windows x64 Test Results

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

github-actions Bot commented Feb 6, 2026 •

edited

Loading

github-actions Bot commented Feb 10, 2026 •

edited

Loading

github-actions Bot commented Feb 10, 2026 •

edited

Loading