-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca #157682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca #157682
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) Changes: [AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca. AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on allocas into i64 extractelement indices. Before this patch, negative GEP offsets would be zero-extended, leading to wrong extractelement indices with values around (2**32-1). This fixes failing LlvmLibcCharacterConverterUTF32To8Test tests for AMDGPU. Full diff: https://github.com/llvm/llvm-project/pull/157682.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 8617d868ef8ab..bb77cdff778c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -443,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return nullptr;
APInt IndexQuot;
- uint64_t Rem;
- APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
- if (Rem != 0)
+ APInt Rem;
+ APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
+ IndexQuot, Rem);
+ if (!Rem.isZero())
return nullptr;
if (VarOffsets.size() == 0)
return ConstantInt::get(GEP->getContext(), IndexQuot);
@@ -454,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
const auto &VarOffset = VarOffsets.front();
APInt OffsetQuot;
- APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
- if (Rem != 0 || OffsetQuot.isZero())
+ APInt::sdivrem(VarOffset.second,
+ APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
+ Rem);
+ if (!Rem.isZero() || OffsetQuot.isZero())
return nullptr;
Value *Offset = VarOffset.first;
@@ -465,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (!OffsetQuot.isOne()) {
ConstantInt *ConstMul =
- ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
+ ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
Offset = Builder.CreateMul(Offset, ConstMul);
if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
NewInsts.push_back(NewInst);
@@ -474,7 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return Offset;
ConstantInt *ConstIndex =
- ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
+ ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
NewInsts.push_back(NewInst);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
new file mode 100644
index 0000000000000..1b6ac0bd93c19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s
+
+; Check that the extracted index is correctly sign-extended when 32-bit scratch
+; address arithmetic is promoted to 64-bit vector index arithmetic.
+
+define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
+; CHECK-LABEL: @negative_index_byte(
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[STACK]], i8 0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT: store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [4 x i8], align 4, addrspace(5)
+ %gep.0 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 0
+ %gep.1 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %gep.2 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 2
+ %gep.3 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 3
+ store i8 0, ptr addrspace(5) %gep.0
+ store i8 1, ptr addrspace(5) %gep.1
+ store i8 2, ptr addrspace(5) %gep.2
+ store i8 3, ptr addrspace(5) %gep.3
+ %vgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 %offset
+ %cgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %vgep, i64 0, i64 -1
+ %load = load i8, ptr addrspace(5) %cgep
+ store i8 %load, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
+; CHECK-LABEL: @negative_index_word(
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i32> poison
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[STACK]], i32 0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT: store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [4 x i32], align 4, addrspace(5)
+ %gep.0 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 0
+ %gep.1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 1
+ %gep.2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 2
+ %gep.3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 3
+ store i32 0, ptr addrspace(5) %gep.0
+ store i32 1, ptr addrspace(5) %gep.1
+ store i32 2, ptr addrspace(5) %gep.2
+ store i32 3, ptr addrspace(5) %gep.3
+ %vgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 %offset
+ %cgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %vgep, i64 0, i64 -1
+ %load = load i32, ptr addrspace(5) %cgep
+ store i32 %load, ptr %out
+ ret void
+}
+
+
|
This relates to this discussion. See the diff from the first commit in the PR to see the previous, wrong behavior in the test. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Appreciate you fixing this, the libc
tests are very unorthodox as far as GPU targets go but they tend to hit weird behavior.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on allocas into i64 extractelement indices. Before this patch, negative GEP offsets would be zero-extended, leading to wrong extractelement indices with values around (2**32-1). This fixes failing LlvmLibcCharacterConverterUTF32To8Test tests for AMDGPU.
b32f5d1
to
64fab74
Compare
Rebase to get lit test fixes from 81a4fcb, to fix the premerge CI. |
Merge activity
|
APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
if (Rem != 0)
APInt Rem;
APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you could still use the form of sdivrem that takes an int64_t dividor, right? Here and below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, I've missed that variant. I opened PR #157864 to address this and your other comment; I'd rather not revert this PR for NFC improvements since it fixed a buildbot failure.
if (!OffsetQuot.isOne()) {
ConstantInt *ConstMul =
ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems odd to convert APInt -> int64_t -> APInt here. Could you construct the ConstantInt directly from OffsetQuot.sext(OffsetType.getBitWidth()) ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See #157864
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca
AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on
allocas into i64 extractelement indices. Before this patch, negative GEP
offsets would be zero-extended, leading to wrong extractelement indices
with values around (2**32-1).
This fixes failing LlvmLibcCharacterConverterUTF32To8Test tests for
AMDGPU.