From cbfdca3b7d1964b19671de50c133fd20d400bae7 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Tue, 23 Jul 2024 09:36:52 -0500
Subject: [PATCH] [NVPTX] Fix internal indirect call prototypes not obeying
 the ABI

Summary:
The NVPTX backend optimizes the ABI for functions that are internal;
however, this is not legal for indirect call prototypes. Previously, we
would modify the ABI of an aggregate byval type passed to an indirect
call prototype, which would make ptxas error. This patch simply passes
a nullptr function to the helper to force strict ABI compliance without
modification.

Fixes https://github.com/llvm/llvm-project/issues/100055

---
 libc/config/gpu/entrypoints.txt             | 15 +---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |  4 +-
 llvm/test/CodeGen/NVPTX/indirect_byval.ll   | 94 +++++++++++++++++++++
 3 files changed, 100 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/indirect_byval.ll

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 42909cec558908..fa878d89992276 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  set(extra_entrypoints
-    # stdio.h entrypoints
-    libc.src.stdio.snprintf
-    libc.src.stdio.sprintf
-    libc.src.stdio.vsnprintf
-    libc.src.stdio.vsprintf
-  )
-endif()
-
 set(TARGET_LIBC_ENTRYPOINTS
   # assert.h entrypoints
   libc.src.assert.__assert_fail
@@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
   libc.src.errno.errno
 
   # stdio.h entrypoints
-  ${extra_entrypoints}
   libc.src.stdio.clearerr
   libc.src.stdio.fclose
   libc.src.stdio.printf
   libc.src.stdio.vprintf
   libc.src.stdio.fprintf
   libc.src.stdio.vfprintf
+  libc.src.stdio.snprintf
+  libc.src.stdio.sprintf
+  libc.src.stdio.vsnprintf
+  libc.src.stdio.vsprintf
   libc.src.stdio.feof
   libc.src.stdio.ferror
   libc.src.stdio.fflush
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 44c1a2e50486c5..bcd2ee2f4fc1e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(
 
   bool first = true;
 
-  const Function *F = CB.getFunction();
   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
     Type *Ty = Args[i].Ty;
@@ -1471,10 +1470,11 @@ std::string NVPTXTargetLowering::getPrototype(
       continue;
     }
 
+    // Indirect calls need strict ABI alignment, so we disable optimizations.
     Type *ETy = Args[i].IndirectType;
     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
     Align ParamByValAlign =
-        getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
+        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
 
     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
     O << "_";
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
new file mode 100644
index 00000000000000..ac6c4e262fd60e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i8 }
+%struct.U = type { i64 }
+
+@ptr = external global ptr, align 8
+
+define internal i32 @foo() {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .local .align 1 .b8 __local_depot0[2];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.u64 %SPL, __local_depot0;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT: ld.u8 %rs1, [%SP+1];
+; CHECK-NEXT: add.u64 %rd2, %SP, 0;
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 1 .b8 param0[1];
+; CHECK-NEXT: st.param.b8 [param0+0], %rs1;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: st.param.b64 [param1+0], %rd2;
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
+; CHECK-NEXT: call (retval0),
+; CHECK-NEXT: %rd1,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: )
+; CHECK-NEXT: , prototype_0;
+; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+entry:
+  %s = alloca %struct.S, align 1
+  %agg.tmp = alloca %struct.S, align 1
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
+  ret i32 %call
+}
+
+define internal i32 @bar() {
+; CHECK-LABEL: bar(
+; CHECK: // @bar
+; CHECK-NEXT: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.u64 %SPL, __local_depot1;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT: ld.u64 %rd2, [%SP+8];
+; CHECK-NEXT: add.u64 %rd3, %SP, 0;
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.b64 [param0+0], %rd2;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: st.param.b64 [param1+0], %rd3;
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
+; CHECK-NEXT: call (retval0),
+; CHECK-NEXT: %rd1,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: )
+; CHECK-NEXT: , prototype_1;
+; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+entry:
+  %s = alloca %struct.U, align 8
+  %agg.tmp = alloca %struct.U, align 8
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
+  ret i32 %call
+}
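
Not part of the patch, but for context: a minimal CUDA-level sketch of the
case the new test covers. The names here (ptr, call_through_pointer) are
hypothetical, and the committed indirect_byval.ll test above is the
authoritative reproducer; this only illustrates how such IR typically
arises from source.

    // Hypothetical reproducer: a small aggregate passed by value through a
    // function pointer. Since no callee definition is visible at the call
    // site, the backend must emit a .callprototype whose byval parameter
    // uses the strict ABI alignment, not the relaxed alignment it may pick
    // for calls to known internal functions.
    struct S {
      char c; // 1-byte aggregate, analogous to %struct.S in the test
    };

    // Device function pointer, assumed to be assigned elsewhere.
    __device__ int (*ptr)(S, int *);

    __device__ int call_through_pointer() {
      S s{};
      int x = 0;
      // Lowers to an indirect call with a byval(%struct.S) argument; before
      // this fix the emitted prototype could disagree with the ABI and
      // ptxas would reject the module.
      return ptr(s, &x);
    }

With the fix, getPrototype computes the byval alignment with a null
Function pointer, so getFunctionByValParamAlign can no longer apply the
internal-function alignment optimization and the emitted .callprototype
always matches the strict ABI.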