-
Notifications
You must be signed in to change notification settings - Fork 11.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
release/19.x: [NVPTX] Fix internal indirect call prototypes not obeying the ABI (#100131) #100174
Conversation
@jhuber6 What do you think about merging this PR to the release branch? |
@llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-libc Author: None (llvmbot) ChangesBackport e0649a5 Requested by: @jhuber6 Full diff: https://github.com/llvm/llvm-project/pull/100174.diff 3 Files Affected:
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 42909cec55890..fa878d8999227 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
- set(extra_entrypoints
- # stdio.h entrypoints
- libc.src.stdio.snprintf
- libc.src.stdio.sprintf
- libc.src.stdio.vsnprintf
- libc.src.stdio.vsprintf
- )
-endif()
-
set(TARGET_LIBC_ENTRYPOINTS
# assert.h entrypoints
libc.src.assert.__assert_fail
@@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.errno.errno
# stdio.h entrypoints
- ${extra_entrypoints}
libc.src.stdio.clearerr
libc.src.stdio.fclose
libc.src.stdio.printf
libc.src.stdio.vprintf
libc.src.stdio.fprintf
libc.src.stdio.vfprintf
+ libc.src.stdio.snprintf
+ libc.src.stdio.sprintf
+ libc.src.stdio.vsnprintf
+ libc.src.stdio.vsprintf
libc.src.stdio.feof
libc.src.stdio.ferror
libc.src.stdio.fflush
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 44c1a2e50486c..6975412ce5d35 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(
bool first = true;
- const Function *F = CB.getFunction();
unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
Type *Ty = Args[i].Ty;
@@ -1471,10 +1470,12 @@ std::string NVPTXTargetLowering::getPrototype(
continue;
}
+ // Indirect calls need strict ABI alignment so we disable optimizations by
+ // not providing a function to optimize.
Type *ETy = Args[i].IndirectType;
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
Align ParamByValAlign =
- getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
+ getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
O << ".param .align " << ParamByValAlign.value() << " .b8 ";
O << "_";
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
new file mode 100644
index 0000000000000..ac6c4e262fd60
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i8 }
+%struct.U = type { i64 }
+
+@ptr = external global ptr, align 8
+
+define internal i32 @foo() {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .local .align 1 .b8 __local_depot0[2];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.u64 %SPL, __local_depot0;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT: ld.u8 %rs1, [%SP+1];
+; CHECK-NEXT: add.u64 %rd2, %SP, 0;
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 1 .b8 param0[1];
+; CHECK-NEXT: st.param.b8 [param0+0], %rs1;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: st.param.b64 [param1+0], %rd2;
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
+; CHECK-NEXT: call (retval0),
+; CHECK-NEXT: %rd1,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: )
+; CHECK-NEXT: , prototype_0;
+; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %s = alloca %struct.S, align 1
+ %agg.tmp = alloca %struct.S, align 1
+ %0 = load ptr, ptr @ptr, align 8
+ %call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
+ ret i32 %call
+}
+
+define internal i32 @bar() {
+; CHECK-LABEL: bar(
+; CHECK: // @bar
+; CHECK-NEXT: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.u64 %SPL, __local_depot1;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT: ld.u64 %rd2, [%SP+8];
+; CHECK-NEXT: add.u64 %rd3, %SP, 0;
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.b64 [param0+0], %rd2;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: st.param.b64 [param1+0], %rd3;
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
+; CHECK-NEXT: call (retval0),
+; CHECK-NEXT: %rd1,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: )
+; CHECK-NEXT: , prototype_1;
+; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %s = alloca %struct.U, align 8
+ %agg.tmp = alloca %struct.U, align 8
+ %0 = load ptr, ptr @ptr, align 8
+ %call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
+ ret i32 %call
+}
|
This should be merged |
…vm#100131) Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes llvm#100055 (cherry picked from commit e0649a5)
@jhuber6 (or anyone else). If you would like to add a note about this fix in the release notes (completely optional). Please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. |
Backport e0649a5
Requested by: @jhuber6