Skip to content
2 changes: 1 addition & 1 deletion docs/source/user/ufunc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ All CUDA ufunc kernels have the ability to call other CUDA device functions::
from numba import vectorize, cuda

# define a device function
@cuda.jit('float32(float32, float32, float32)', device=True, inline=True)
@cuda.jit('float32(float32, float32, float32)', device=True, inline="always")
def cu_device_fn(x, y, z):
return x ** y / z

Expand Down
14 changes: 13 additions & 1 deletion numba_cuda/numba/cuda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def jit(
func_or_sig=None,
device=False,
inline=False,
inline="never",
link=[],
debug=None,
opt=None,
Expand Down Expand Up @@ -81,6 +81,15 @@ def jit(
msg = _msg_deprecated_signature_arg.format("bind")
raise DeprecationError(msg)

if isinstance(inline, bool):
DeprecationWarning(
"Passing bool to inline argument is deprecated, please refer to "
"Numba's documentation on inlining: "
"https://numba.readthedocs.io/en/stable/developer/inlining.html"
)

inline = "always" if inline else "never"

debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
opt = (config.OPT != 0) if opt is None else opt
fastmath = kws.get("fastmath", False)
Expand Down Expand Up @@ -130,6 +139,7 @@ def _jit(func):
targetoptions["opt"] = opt
targetoptions["fastmath"] = fastmath
targetoptions["device"] = device
targetoptions["inline"] = inline
targetoptions["extensions"] = extensions

disp = CUDADispatcher(func, targetoptions=targetoptions)
Expand Down Expand Up @@ -171,6 +181,7 @@ def autojitwrapper(func):
return jit(
func,
device=device,
inline=inline,
debug=debug,
opt=opt,
lineinfo=lineinfo,
Expand All @@ -194,6 +205,7 @@ def autojitwrapper(func):
targetoptions["link"] = link
targetoptions["fastmath"] = fastmath
targetoptions["device"] = device
targetoptions["inline"] = inline
targetoptions["extensions"] = extensions
disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)

Expand Down
2 changes: 1 addition & 1 deletion numba_cuda/numba/cuda/tests/cudapy/test_array_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class TestCudaArrayArg(CUDATestCase):
def test_array_ary(self):
@cuda.jit("double(double[:],int64)", device=True, inline=True)
@cuda.jit("double(double[:],int64)", device=True, inline="always")
def device_function(a, c):
return a[c]

Expand Down
2 changes: 1 addition & 1 deletion numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_blackscholes(self):
VOLATILITY,
)

@cuda.jit(double(double), device=True, inline=True)
@cuda.jit(double(double), device=True, inline="always")
def cnd_cuda(d):
K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
ret_val = (
Expand Down
59 changes: 59 additions & 0 deletions numba_cuda/numba/cuda/tests/cudapy/test_inline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import re
import numpy as np
from numba import cuda, types
from numba.cuda.testing import (
unittest,
CUDATestCase,
skip_on_cudasim,
)


class TestCudaInline(CUDATestCase):
    @skip_on_cudasim("Cudasim does not support inline")
    def _test_call_inline(self, inline):
        """Compile a kernel that calls a device function decorated with
        ``@cuda.jit(inline=...)``, check the kernel produced the expected
        result, then inspect the generated LLVM IR to confirm whether the
        inner call was actually inlined.
        """
        buf = np.ones(2, dtype=np.int32)
        sig = (types.int32[::1],)

        # Device-callable function whose inlining behaviour is under test.
        @cuda.jit(inline=inline)
        def set_zero(a):
            a[0] = 0

        @cuda.jit(sig)
        def call_set_zero(a):
            set_zero(a)

        call_set_zero[1, 2](buf)

        # set_zero clears only element 0, so [1, 1] becomes [0, 1].
        self.assertTrue(np.all(buf == np.arange(2, dtype=np.int32)))

        # A surviving "call ... @" in the IR means the call was not inlined.
        ir = call_set_zero.inspect_llvm(sig)
        call_site = re.search(r"call [a-zA-Z0-9]* @", ir)

        if inline is True or inline == "always":
            # The call must have been folded into the caller.
            self.assertIsNone(call_site, msg=ir)
        else:
            assert inline is False or inline == "never"

            # The call instruction must still be present.
            self.assertIsNotNone(call_site, msg=ir)

    def test_call_inline_always(self):
        self._test_call_inline("always")

    def test_call_inline_never(self):
        self._test_call_inline("never")

    def test_call_inline_true(self):
        self._test_call_inline(True)

    def test_call_inline_false(self):
        self._test_call_inline(False)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion numba_cuda/numba/cuda/tests/cudapy/test_laplace.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

class TestCudaLaplace(CUDATestCase):
def test_laplace_small(self):
@cuda.jit(float64(float64, float64), device=True, inline=True)
@cuda.jit(float64(float64, float64), device=True, inline="always")
def get_max(a, b):
if a > b:
return a
Expand Down
2 changes: 1 addition & 1 deletion numba_cuda/numba/cuda/vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def __vectorized_{name}({args}, __out__):

class CUDAVectorize(deviceufunc.DeviceVectorize):
def _compile_core(self, sig):
cudevfn = cuda.jit(sig, device=True, inline=True)(self.pyfunc)
cudevfn = cuda.jit(sig, device=True, inline="always")(self.pyfunc)
return cudevfn, cudevfn.overloads[sig.args].signature.return_type

def _get_globals(self, corefn):
Expand Down