-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca #157682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca #157682
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) Changes: [AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca. AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on allocas into i64 extractelement indices. Before this patch, negative GEP offsets would be zero-extended, leading to wrong extractelement indices with values around (2**32-1). This fixes failing LlvmLibcCharacterConverterUTF32To8Test tests for AMDGPU. Full diff: https://github.com/llvm/llvm-project/pull/157682.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 8617d868ef8ab..bb77cdff778c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -443,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return nullptr;
APInt IndexQuot;
- uint64_t Rem;
- APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
- if (Rem != 0)
+ APInt Rem;
+ APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
+ IndexQuot, Rem);
+ if (!Rem.isZero())
return nullptr;
if (VarOffsets.size() == 0)
return ConstantInt::get(GEP->getContext(), IndexQuot);
@@ -454,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
const auto &VarOffset = VarOffsets.front();
APInt OffsetQuot;
- APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
- if (Rem != 0 || OffsetQuot.isZero())
+ APInt::sdivrem(VarOffset.second,
+ APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
+ Rem);
+ if (!Rem.isZero() || OffsetQuot.isZero())
return nullptr;
Value *Offset = VarOffset.first;
@@ -465,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (!OffsetQuot.isOne()) {
ConstantInt *ConstMul =
- ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
+ ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
Offset = Builder.CreateMul(Offset, ConstMul);
if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
NewInsts.push_back(NewInst);
@@ -474,7 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return Offset;
ConstantInt *ConstIndex =
- ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
+ ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
NewInsts.push_back(NewInst);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
new file mode 100644
index 0000000000000..1b6ac0bd93c19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s
+
+; Check that the extracted index is correctly sign-extended when 32-bit scratch
+; address arithmetic is promoted to 64-bit vector index arithmetic.
+
+define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
+; CHECK-LABEL: @negative_index_byte(
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[STACK]], i8 0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT: store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [4 x i8], align 4, addrspace(5)
+ %gep.0 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 0
+ %gep.1 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %gep.2 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 2
+ %gep.3 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 3
+ store i8 0, ptr addrspace(5) %gep.0
+ store i8 1, ptr addrspace(5) %gep.1
+ store i8 2, ptr addrspace(5) %gep.2
+ store i8 3, ptr addrspace(5) %gep.3
+ %vgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 %offset
+ %cgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %vgep, i64 0, i64 -1
+ %load = load i8, ptr addrspace(5) %cgep
+ store i8 %load, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
+; CHECK-LABEL: @negative_index_word(
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i32> poison
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[STACK]], i32 0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT: store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [4 x i32], align 4, addrspace(5)
+ %gep.0 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 0
+ %gep.1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 1
+ %gep.2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 2
+ %gep.3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 3
+ store i32 0, ptr addrspace(5) %gep.0
+ store i32 1, ptr addrspace(5) %gep.1
+ store i32 2, ptr addrspace(5) %gep.2
+ store i32 3, ptr addrspace(5) %gep.3
+ %vgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 %offset
+ %cgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %vgep, i64 0, i64 -1
+ %load = load i32, ptr addrspace(5) %cgep
+ store i32 %load, ptr %out
+ ret void
+}
+
+
|
This relates to this discussion. See the diff from the first commit in the PR to see the previous, wrong behavior in the test. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Appreciate you fixing this, the libc
tests are very unorthodox as far as GPU targets go but they tend to hit weird behavior.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on allocas into i64 extractelement indices. Before this patch, negative GEP offsets would be zero-extended, leading to wrong extractelement indices with values around (2**32-1). This fixes failing LlvmLibcCharacterConverterUTF32To8Test tests for AMDGPU.
b32f5d1
to
64fab74
Compare
Rebase to get lit test fixes from 81a4fcb, to fix the premerge CI. |
Merge activity
|
APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
if (Rem != 0)
APInt Rem;
APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you could still use the form of sdivrem that takes an int64_t dividor, right? Here and below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, I've missed that variant. I opened PR #157864 to address this and your other comment; I'd rather not revert this PR for NFC improvements since it fixed a buildbot failure.
if (!OffsetQuot.isOne()) {
ConstantInt *ConstMul =
ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems odd to convert APInt -> int64_t -> APInt here. Could you construct the ConstantInt directly from OffsetQuot.sext(OffsetType.getBitWidth()) ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See #157864
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca
AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on
allocas into i64 extractelement indices. Before this patch, negative GEP
offsets would be zero-extended, leading to wrong extractelement indices
with values around (2**32-1).
This fixes failing LlvmLibcCharacterConverterUTF32To8Test tests for
AMDGPU.