
Fix cuda.jit decorator inline #181

Merged
gmarkall merged 10 commits into NVIDIA:main from ZzEeKkAa:yhavrylko/fix/jit_inline on May 1, 2025

Conversation

@ZzEeKkAa
Contributor

In the @cuda.jit decorator, the inline argument is silently ignored: because it is explicitly declared in the function signature, it is not part of kws and is never passed on to the dispatcher. This PR fixes that by removing it from the function signature.
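A minimal sketch of the failure mode (hypothetical names, not the actual numba-cuda source): when a kwarg is declared explicitly in the decorator's signature, it binds to that parameter and never lands in **kws, so code that only forwards kws silently drops it.

def _make_dispatcher(func, targetoptions):
    # Hypothetical stand-in for the real dispatcher construction.
    print("dispatcher sees:", targetoptions)
    return func

def jit_broken(func=None, device=False, inline="never", **kws):
    # 'inline' is captured by the explicit parameter above, so kws
    # does not contain it and the dispatcher never sees it.
    return _make_dispatcher(func, targetoptions=kws)

def jit_fixed(func=None, device=False, **kws):
    # With no explicit 'inline' parameter, it stays inside **kws and
    # flows through with the other target options.
    return _make_dispatcher(func, targetoptions=kws)

jit_broken(lambda: None, inline="always")  # dispatcher sees: {}
jit_fixed(lambda: None, inline="always")   # dispatcher sees: {'inline': 'always'}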

Here is an affected example:

from numba import cuda
import numpy as np

@cuda.jit(inline="always")
def set_zero(a):
    a[0] = 0

@cuda.jit
def f(a):
    set_zero(a)


a = np.ones(2)

f[1, 2](a)

print(a)

Running the example prints [0. 1.] both before and after the change; the fix is visible only in the generated code. Optimized LLVM code before the change (note that the call to set_zero is still present):

LLVM DUMP: Post kernel fixup _ZN8__main__1fB2v1B92cw51cXTLSUwv1sDUaKthoaNgqamjgBQYtuBwBgY3WszbglIhRBiSyJAEUBMOTALJ3bbV6rDgVbdSgEaTOlASyDOu1QQAE5ArrayIdLi1E1C7mutable7alignedE
; ModuleID = "f$1"
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

@"_ZN08NumbaEnv8__main__1fB2v1B92cw51cXTLSUwv1sDUaKthoaNgqamjgBQYtuBwBgY3WszbglIhRBiSyJAEUBMOTALJ3bbV6rDgVbdSgEaTOlASyDOu1QQAE5ArrayIdLi1E1C7mutable7alignedE" = common global i8* null
define void @"_ZN8__main__1fB2v1B92cw51cXTLSUwv1sDUaKthoaNgqamjgBQYtuBwBgY3WszbglIhRBiSyJAEUBMOTALJ3bbV6rDgVbdSgEaTOlASyDOu1QQAE5ArrayIdLi1E1C7mutable7alignedE"(i8* %"arg.a.0", i8* %"arg.a.1", i64 %"arg.a.2", i64 %"arg.a.3", double* %"arg.a.4", i64 %"arg.a.5.0", i64 %"arg.a.6.0")
{
entry:
  %"inserted.meminfo" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} undef, i8* %"arg.a.0", 0
  %"inserted.parent" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.meminfo", i8* %"arg.a.1", 1
  %"inserted.nitems" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.parent", i64 %"arg.a.2", 2
  %"inserted.itemsize" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.nitems", i64 %"arg.a.3", 3
  %"inserted.data" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.itemsize", double* %"arg.a.4", 4
  %".10" = insertvalue [1 x i64] undef, i64 %"arg.a.5.0", 0
  %"inserted.shape" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.data", [1 x i64] %".10", 5
  %".11" = insertvalue [1 x i64] undef, i64 %"arg.a.6.0", 0
  %"inserted.strides" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.shape", [1 x i64] %".11", 6
  %".12" = alloca i8*
  store i8* null, i8** %".12"
  br label %"B0"
B0:
  store i8* null, i8** %".12"
  %"extracted.meminfo" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 0
  %"extracted.parent" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 1
  %"extracted.nitems" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 2
  %"extracted.itemsize" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 3
  %"extracted.data" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 4
  %"extracted.shape" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 5
  %".15" = extractvalue [1 x i64] %"extracted.shape", 0
  %"extracted.strides" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 6
  %".16" = extractvalue [1 x i64] %"extracted.strides", 0
  %".17" = call i32 @"_ZN8__main__8set_zeroB2v2B100cw51cXTLSUwv1sDUaKthoaNgqamjgBQYtuBwBgY3WszbglIhRBiSyJAEUBMOTALJ3bbV6kCWupWCsY6COiyggVywC2s1AQ_3d_3dE5ArrayIdLi1E1C7mutable7alignedE"(i8** %".12", i8* %"extracted.meminfo", i8* %"extracted.parent", i64 %"extracted.nitems", i64 %"extracted.itemsize", double* %"extracted.data", i64 %".15", i64 %".16")
  %".18" = icmp eq i32 %".17", 0
  %".19" = icmp eq i32 %".17", -2
  %".20" = or i1 %".18", %".19"
  %".21" = xor i1 %".20", -1
  %".22" = icmp eq i32 %".17", -1
  %".23" = icmp eq i32 %".17", -3
  %".24" = icmp sge i32 %".17", 1
  %".25" = load i8*, i8** %".12"
  br i1 %".21", label %"B0.if", label %"B0.endif", !prof !1
B0.if:
  ret void
B0.endif:
  ret void
}

declare i32 @"_ZN8__main__8set_zeroB2v2B100cw51cXTLSUwv1sDUaKthoaNgqamjgBQYtuBwBgY3WszbglIhRBiSyJAEUBMOTALJ3bbV6kCWupWCsY6COiyggVywC2s1AQ_3d_3dE5ArrayIdLi1E1C7mutable7alignedE"(i8** %".ret", i8* %"arg.a.0", i8* %"arg.a.1", i64 %"arg.a.2", i64 %"arg.a.3", double* %"arg.a.4", i64 %"arg.a.5.0", i64 %"arg.a.6.0")

@"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN8__main__1fB2v1B92cw51cXTLSUwv1sDUaKthoaNgqamjgBQYtuBwBgY3WszbglIhRBiSyJAEUBMOTALJ3bbV6rDgVbdSgEaTOlASyDOu1QQAE5ArrayIdLi1E1C7mutable7alignedE" to i8*)], section "llvm.metadata"
!nvvmir.version = !{ !0 }
!nvvm.annotations = !{ !2 }
!0 = !{ i32 2, i32 0, i32 3, i32 1 }
!1 = !{ !"branch_weights", i32 1, i32 99 }
!2 = !{ void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN8__main__1fB2v1B92cw51cXTLSUwv1sDUaKthoaNgqamjgBQYtuBwBgY3WszbglIhRBiSyJAEUBMOTALJ3bbV6rDgVbdSgEaTOlASyDOu1QQAE5ArrayIdLi1E1C7mutable7alignedE", !"kernel", i32 1 }

Optimized LLVM code after the change (the body of set_zero has been inlined into the kernel, so no call remains):

LLVM DUMP: Post kernel fixup _ZN8__main__1fB2v1B104cw51cXTLSUwv1sDUaKthoaNgqamjgEhrsDBECh5bcMgDIwAtLdiC0iVEGJLskARQkxJMAsknttXqsABXt1KAWqoOlATyjGs1AQ_3d_3dE5ArrayIdLi1E1C7mutable7alignedE
; ModuleID = "f$1"
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

@"_ZN08NumbaEnv8__main__1fB2v1B104cw51cXTLSUwv1sDUaKthoaNgqamjgEhrsDBECh5bcMgDIwAtLdiC0iVEGJLskARQkxJMAsknttXqsABXt1KAWqoOlATyjGs1AQ_3d_3dE5ArrayIdLi1E1C7mutable7alignedE" = common global i8* null
define void @"_ZN8__main__1fB2v1B104cw51cXTLSUwv1sDUaKthoaNgqamjgEhrsDBECh5bcMgDIwAtLdiC0iVEGJLskARQkxJMAsknttXqsABXt1KAWqoOlATyjGs1AQ_3d_3dE5ArrayIdLi1E1C7mutable7alignedE"(i8* %"arg.a.0", i8* %"arg.a.1", i64 %"arg.a.2", i64 %"arg.a.3", double* %"arg.a.4", i64 %"arg.a.5.0", i64 %"arg.a.6.0")
{
entry:
  %"inserted.meminfo" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} undef, i8* %"arg.a.0", 0
  %"inserted.parent" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.meminfo", i8* %"arg.a.1", 1
  %"inserted.nitems" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.parent", i64 %"arg.a.2", 2
  %"inserted.itemsize" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.nitems", i64 %"arg.a.3", 3
  %"inserted.data" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.itemsize", double* %"arg.a.4", 4
  %".10" = insertvalue [1 x i64] undef, i64 %"arg.a.5.0", 0
  %"inserted.shape" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.data", [1 x i64] %".10", 5
  %".11" = insertvalue [1 x i64] undef, i64 %"arg.a.6.0", 0
  %"inserted.strides" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.shape", [1 x i64] %".11", 6
  %".12" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12"
  br label %"B0"
B0:
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12"
  %".16" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12", i32 0, i32 5
  %".17" = getelementptr inbounds [1 x i64], [1 x i64]* %".16", i32 0, i32 0
  %".18" = load i64, i64* %".17", !range !1
  %".19" = insertvalue [1 x i64] undef, i64 %".18", 0
  %".20" = extractvalue [1 x i64] %".19", 0
  %".21" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12", i32 0, i32 6
  %".22" = load [1 x i64], [1 x i64]* %".21"
  %".23" = extractvalue [1 x i64] %".22", 0
  %".24" = icmp slt i64 0, 0
  %".25" = add i64 0, %".20"
  %".26" = select  i1 %".24", i64 %".25", i64 0
  %".27" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12", i32 0, i32 5
  %".28" = getelementptr inbounds [1 x i64], [1 x i64]* %".27", i32 0, i32 0
  %".29" = load i64, i64* %".28", !range !1
  %".30" = insertvalue [1 x i64] undef, i64 %".29", 0
  %".31" = extractvalue [1 x i64] %".30", 0
  %".32" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12", i32 0, i32 6
  %".33" = load [1 x i64], [1 x i64]* %".32"
  %".34" = extractvalue [1 x i64] %".33", 0
  %".35" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".12", i32 0, i32 4
  %".36" = load double*, double** %".35"
  %".37" = mul i64 %".26", 1
  %".38" = add i64 0, %".37"
  %".39" = getelementptr double, double* %".36", i64 %".38"
  %".40" = sitofp i64 0 to double
  store double %".40", double* %".39"
  ret void
}

@"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN8__main__1fB2v1B104cw51cXTLSUwv1sDUaKthoaNgqamjgEhrsDBECh5bcMgDIwAtLdiC0iVEGJLskARQkxJMAsknttXqsABXt1KAWqoOlATyjGs1AQ_3d_3dE5ArrayIdLi1E1C7mutable7alignedE" to i8*)], section "llvm.metadata"
!nvvmir.version = !{ !0 }
!nvvm.annotations = !{ !2 }
!0 = !{ i32 2, i32 0, i32 3, i32 1 }
!1 = !{ i64 0, i64 9223372036854775807 }
!2 = !{ void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN8__main__1fB2v1B104cw51cXTLSUwv1sDUaKthoaNgqamjgEhrsDBECh5bcMgDIwAtLdiC0iVEGJLskARQkxJMAsknttXqsABXt1KAWqoOlATyjGs1AQ_3d_3dE5ArrayIdLi1E1C7mutable7alignedE", !"kernel", i32 1 }
================================================================================

@ZzEeKkAa
Contributor Author

It actually reveals a few bugs: if we want to be aligned with numba's jit, passing inline=True now raises ValueError: kwarg 'inline' must be one of the strings 'always' or 'never', or it can be a callable that returns True/False. Found value True.
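A hedged repro of that error at this stage of the PR (before bool support was restored later in the thread); the expected message is quoted from the comment above:

from numba import cuda

# A bool where 'always', 'never', or a callable is expected:
@cuda.jit(device=True, inline=True)
def g(a):
    a[0] = 0

# Expected at this stage of the PR:
# ValueError: kwarg 'inline' must be one of the strings 'always' or
# 'never', or it can be a callable that returns True/False. Found
# value True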

@isVoid
Contributor

isVoid commented Mar 31, 2025

Is the difference between the two code dumps that the unoptimized version contains a call instruction, but the optimized one inlines the function? Can you add a test to make sure this is expected? f.inspect_llvm() should give you the LLVM strings.

@ZzEeKkAa
Contributor Author

ZzEeKkAa commented Apr 1, 2025

Is the difference between the two code dumps that the unoptimized version contains a call instruction, but the optimized one inlines the function?

Yes

Can you add a test to make sure this is expected? f.inspect_llvm() should give you the LLVM strings.

Thank you for the reference! Just added tests.
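A sketch of what such a test might look like, assuming the documented Dispatcher.inspect_llvm() interface (a mapping from compiled signature to LLVM IR text); the assertion is illustrative rather than the exact test added in this PR:

from numba import cuda
import numpy as np

@cuda.jit(inline="always")
def set_zero(a):
    a[0] = 0

@cuda.jit
def f(a):
    set_zero(a)

a = np.ones(2)
f[1, 2](a)

# inspect_llvm() maps each compiled signature to its LLVM IR text.
llvm_ir = next(iter(f.inspect_llvm().values()))
# If inlining was honoured, the set_zero symbol should no longer
# appear in the kernel module (as in the "after" dump above).
assert "set_zero" not in llvm_ir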

gmarkall added the "4 - Waiting on author" label Apr 8, 2025
@gmarkall
Contributor

gmarkall commented Apr 8, 2025

This needs the formatting fixed up following the merge of #170.

ZzEeKkAa force-pushed the yhavrylko/fix/jit_inline branch from 9b96a36 to 21718b1 on April 8, 2025
@ZzEeKkAa
Contributor Author

ZzEeKkAa commented Apr 8, 2025

@gmarkall it is ready to go now

gmarkall added the "4 - Waiting on reviewer" label and removed the "4 - Waiting on author" label Apr 11, 2025
@gmarkall
Contributor

Unfortunately there is user code that uses inline=True, so we need to still accept that: https://github.com/search?type=code&q=%40cuda.jit%28device%3DTrue%2C+inline%3DTrue%29

I think translating it to "always" and passing it along would be fine. I suspect there was a time when it worked (maybe a very long time ago), but it probably got broken years ago too.

@gmarkall (Contributor) left a comment

Other than needing to continue accepting True and False, I think the PR is good.

gmarkall added the "4 - Waiting on author" label and removed the "4 - Waiting on reviewer" label Apr 11, 2025
@ZzEeKkAa
Contributor Author

Thank you @gmarkall! I hope it is okay to support bool and show a deprecation warning if the user uses it.
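A minimal sketch of that approach, using a hypothetical helper rather than the merged implementation; the accepted values and error text follow the ValueError quoted earlier in the thread:

import warnings

def _normalize_inline(inline):
    # Legacy booleans: translate and warn, as discussed above.
    if isinstance(inline, bool):
        warnings.warn(
            "Passing a bool to 'inline' is deprecated; use 'always' "
            "or 'never' instead.",
            DeprecationWarning,
        )
        return "always" if inline else "never"
    if inline in ("always", "never") or callable(inline):
        return inline
    raise ValueError(
        "kwarg 'inline' must be one of the strings 'always' or "
        "'never', or it can be a callable that returns True/False. "
        f"Found value {inline}"
    )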

ZzEeKkAa requested a review from gmarkall April 16, 2025
gmarkall added the "4 - Waiting on reviewer" label and removed the "4 - Waiting on author" label Apr 17, 2025
@gmarkall (Contributor) left a comment

I think this looks good - just one suggestion to link to the docs, then this can be merged.

gmarkall removed the "4 - Waiting on reviewer" label Apr 30, 2025
gmarkall added the "4 - Waiting on author" label Apr 30, 2025
Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com>
ZzEeKkAa requested a review from gmarkall April 30, 2025
gmarkall added the "5 - Ready to merge" label and removed the "4 - Waiting on author" label May 1, 2025
gmarkall merged commit 9c727b0 into NVIDIA:main May 1, 2025
35 checks passed
gmarkall added a commit to gmarkall/numba-cuda that referenced this pull request May 2, 2025
- Fix Invalid NVVM IR emitted when lowering shfl_sync APIs (NVIDIA#231)
- Add Bfloat16 Low++ Bindings (NVIDIA#166)
- Fix cuda.jit decorator inline (NVIDIA#181)
- Feature: cuda specific make_attribute_wrapper (NVIDIA#193)
- return a none tuple if no libdevice path is found (NVIDIA#234)
gmarkall mentioned this pull request May 2, 2025
gmarkall added a commit to gmarkall/numba-cuda that referenced this pull request May 3, 2025
- Local variable debug info deduplication (NVIDIA#222)
- Fix package installation for wheels CI  (NVIDIA#238)
- Fix Invalid NVVM IR emitted when lowering shfl_sync APIs (NVIDIA#231)
- Add Bfloat16 Low++ Bindings (NVIDIA#166)
- Fix cuda.jit decorator inline (NVIDIA#181)
- Feature: cuda specific make_attribute_wrapper (NVIDIA#193)
- return a none tuple if no libdevice path is found (NVIDIA#234)
gmarkall mentioned this pull request May 3, 2025
gmarkall added a commit that referenced this pull request May 3, 2025
- Local variable debug info deduplication (#222)
- Fix package installation for wheels CI  (#238)
- Fix Invalid NVVM IR emitted when lowering shfl_sync APIs (#231)
- Add Bfloat16 Low++ Bindings (#166)
- Fix cuda.jit decorator inline (#181)
- Feature: cuda specific make_attribute_wrapper (#193)
- return a none tuple if no libdevice path is found (#234)
gmarkall added a commit to gmarkall/numba-cuda that referenced this pull request May 9, 2025
PR NVIDIA#181 aimed to align the behaviour of the `inline` kwarg with that of
upstream Numba, in that it now forces inlining at the Numba IR level.
It turns out that this kwarg in Numba-CUDA already had the prior effect
of enabling inlining at the NVVM IR level.

Because the default value of `inline` is `"never"`, this was interpreted
by the `compile_cuda()` function as a `True`ish value and every device
function got marked with the `alwaysinline` function attribute. This is
a minor problem in that it probably forces a lot of inlining that we
don't want, but also a major problem in that it triggers an NVVM bug,
only resolved in CUDA 12.3, that causes a hang in
`nvvmCompileProgram()`.

To rectify these issues, we add the `forceinline` kwarg to the
`@cuda.jit` decorator and the `cuda.compile[_*]()` functions. Now,
`compile_cuda()` will only enable inlining at the NVVM IR level for
`forceinline` and not `inline`. This is aligned with the behaviour of
upstream Numba (see numba/numba#10068). We now document the `inline` and
`forceinline` kwargs to clarify the intent and behaviour for users.

For clarity: the behaviour is now:

- The `inline` kwarg enables inlining only at the Numba IR level.
- The `forceinline` kwarg enables inlining only at the NVVM IR level.
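An illustrative usage sketch of the two levels described above; the forceinline spelling comes from the commit message, and the exact signature should be treated as an assumption. The truthiness trap is plain Python: any non-empty string, including "never", is truthy.

from numba import cuda

# Why "never" was treated as enabled by the old check:
assert bool("never") is True

# Inlining at the Numba IR level: the callee's Numba IR is merged
# into the caller before lowering.
@cuda.jit(device=True, inline="always")
def g(a):
    a[0] = 0

# Inlining at the NVVM IR level: the function is marked with the
# alwaysinline attribute and NVVM performs the inlining.
@cuda.jit(device=True, forceinline=True)
def h(a):
    a[0] = 0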
gmarkall added a commit to gmarkall/numba-cuda that referenced this pull request May 9, 2025
gmarkall added a commit to gmarkall/numba-cuda that referenced this pull request May 9, 2025
gmarkall added a commit that referenced this pull request May 9, 2025
gmarkall added a commit that referenced this pull request May 9, 2025

Labels

5 - Ready to merge (Testing and reviews complete, ready to merge)


3 participants