Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release/19.x: [NVPTX] Fix internal indirect call prototypes not obeying the ABI (#100131) #100174

Merged
merged 1 commit into from
Jul 24, 2024

Conversation

llvmbot
Copy link
Collaborator

@llvmbot llvmbot commented Jul 23, 2024

Backport e0649a5

Requested by: @jhuber6

@llvmbot llvmbot added this to the LLVM 19.X Release milestone Jul 23, 2024
@llvmbot
Copy link
Collaborator Author

llvmbot commented Jul 23, 2024

@jhuber6 What do you think about merging this PR to the release branch?

@llvmbot
Copy link
Collaborator Author

llvmbot commented Jul 23, 2024

@llvm/pr-subscribers-backend-nvptx

@llvm/pr-subscribers-libc

Author: None (llvmbot)

Changes

Backport e0649a5

Requested by: @jhuber6


Full diff: https://github.com/llvm/llvm-project/pull/100174.diff

3 Files Affected:

  • (modified) libc/config/gpu/entrypoints.txt (+4-11)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+3-2)
  • (added) llvm/test/CodeGen/NVPTX/indirect_byval.ll (+94)
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 42909cec55890..fa878d8999227 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  set(extra_entrypoints
-      # stdio.h entrypoints
-      libc.src.stdio.snprintf
-      libc.src.stdio.sprintf
-      libc.src.stdio.vsnprintf
-      libc.src.stdio.vsprintf
-  )
-endif()
-
 set(TARGET_LIBC_ENTRYPOINTS
     # assert.h entrypoints
     libc.src.assert.__assert_fail
@@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.errno.errno
 
     # stdio.h entrypoints
-    ${extra_entrypoints}
     libc.src.stdio.clearerr
     libc.src.stdio.fclose
     libc.src.stdio.printf
     libc.src.stdio.vprintf
     libc.src.stdio.fprintf
     libc.src.stdio.vfprintf
+    libc.src.stdio.snprintf
+    libc.src.stdio.sprintf
+    libc.src.stdio.vsnprintf
+    libc.src.stdio.vsprintf
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fflush
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 44c1a2e50486c..6975412ce5d35 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(
 
   bool first = true;
 
-  const Function *F = CB.getFunction();
   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
     Type *Ty = Args[i].Ty;
@@ -1471,10 +1470,12 @@ std::string NVPTXTargetLowering::getPrototype(
       continue;
     }
 
+    // Indirect calls need strict ABI alignment so we disable optimizations by
+    // not providing a function to optimize.
     Type *ETy = Args[i].IndirectType;
     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
     Align ParamByValAlign =
-        getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
+        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
 
     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
     O << "_";
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
new file mode 100644
index 0000000000000..ac6c4e262fd60
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i8 }
+%struct.U = type { i64 }
+
+@ptr = external global ptr, align 8
+
+define internal i32 @foo() {
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 1 .b8 __local_depot0[2];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.u64 %SPL, __local_depot0;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.u8 %rs1, [%SP+1];
+; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 1 .b8 param0[1];
+; CHECK-NEXT:    st.param.b8 [param0+0], %rs1;
+; CHECK-NEXT:    .param .b64 param1;
+; CHECK-NEXT:    st.param.b64 [param1+0], %rd2;
+; CHECK-NEXT:    .param .b32 retval0;
+; CHECK-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd1,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_0;
+; CHECK-NEXT:    ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+entry:
+  %s = alloca %struct.S, align 1
+  %agg.tmp = alloca %struct.S, align 1
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
+  ret i32 %call
+}
+
+define internal i32 @bar() {
+; CHECK-LABEL: bar(
+; CHECK:         // @bar
+; CHECK-NEXT:  {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.u64 %SPL, __local_depot1;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.u64 %rd2, [%SP+8];
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    { // callseq 1, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.b64 [param0+0], %rd2;
+; CHECK-NEXT:    .param .b64 param1;
+; CHECK-NEXT:    st.param.b64 [param1+0], %rd3;
+; CHECK-NEXT:    .param .b32 retval0;
+; CHECK-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd1,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_1;
+; CHECK-NEXT:    ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT:    } // callseq 1
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+entry:
+  %s = alloca %struct.U, align 8
+  %agg.tmp = alloca %struct.U, align 8
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
+  ret i32 %call
+}

@jhuber6
Copy link
Contributor

jhuber6 commented Jul 23, 2024

This should be merged

…vm#100131)

Summary:
The NVPTX backend optimizes the ABI for functions that are internal,
however, this is not legal for indirect call prototypes. Previously, we
would modify the ABI on an aggregate byval type passed to an indirect
call prototype, which would make PTXAS error. This patch just passes the
function as a nullptr to force strict ABI compliance without
modification in the helper function.

Fixes llvm#100055

(cherry picked from commit e0649a5)
@tru tru merged commit dcc22f9 into llvm:release/19.x Jul 24, 2024
6 of 9 checks passed
Copy link

@jhuber6 (or anyone else). If you would like to add a note about this fix in the release notes (completely optional). Please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
Development

Successfully merging this pull request may close these issues.

3 participants