diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index 3cab447c3..bb5ce81c9 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -278,7 +278,7 @@ def compile_cuda(
     args,
     debug=False,
     lineinfo=False,
-    inline=False,
+    forceinline=False,
     fastmath=False,
     nvvm_options=None,
     cc=None,
@@ -316,7 +316,7 @@ def compile_cuda(
     else:
         flags.error_model = "numpy"
 
-    if inline:
+    if forceinline:
         flags.forceinline = True
     if fastmath:
         flags.fastmath = True
@@ -574,6 +574,7 @@ def compile(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.
@@ -614,6 +615,11 @@ def compile(
     :type abi_info: dict
     :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
     :type output: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+                        ``True``. This is accomplished by adding the
+                        ``alwaysinline`` function attribute to the function
+                        definition. This is only valid when the output is
+                        ``"ltoir"``.
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
@@ -626,6 +632,12 @@ def compile(
     if output not in ("ptx", "ltoir"):
         raise NotImplementedError(f"Unsupported output type: {output}")
 
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
+
     debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
     opt = (config.OPT != 0) if opt is None else opt
 
@@ -660,6 +672,7 @@ def compile(
         fastmath=fastmath,
         nvvm_options=nvvm_options,
         cc=cc,
+        forceinline=forceinline,
     )
     resty = cres.signature.return_type
 
@@ -699,6 +712,7 @@ def compile_for_current_device(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given signature for
     the current device's compute capabilility. This calls :func:`compile` with an
@@ -716,6 +730,7 @@ def compile_for_current_device(
         abi=abi,
         abi_info=abi_info,
         output=output,
+        forceinline=forceinline,
     )
 
 
@@ -730,6 +745,7 @@ def compile_ptx(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel
@@ -747,6 +763,7 @@ def compile_ptx(
         abi=abi,
         abi_info=abi_info,
         output="ptx",
+        forceinline=forceinline,
     )
 
 
@@ -760,6 +777,7 @@ def compile_ptx_for_current_device(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capabilility. See :func:`compile_ptx`."""
@@ -775,6 +793,7 @@ def compile_ptx_for_current_device(
         opt=opt,
         abi=abi,
         abi_info=abi_info,
+        forceinline=forceinline,
     )
 
 
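The compile() entry points above gain a forceinline keyword that is only valid when compiling a device function to LTO-IR; the new checks reject kernels and PTX output with a ValueError. A minimal usage sketch based on the signature added above (the device function body and its argument types are illustrative only, and an LTO-capable toolkit is assumed):

from numba import cuda, types

def set_one(a):
    # Illustrative device function body
    a[0] = 1

# forceinline requires device=True and output="ltoir"; any other
# combination raises ValueError per the checks added in compile().
ltoir, resty = cuda.compile(
    set_one,
    (types.float32[::1],),
    device=True,
    output="ltoir",
    forceinline=True,
)
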
diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py
index d5a0a29b3..58f015a9f 100644
--- a/numba_cuda/numba/cuda/decorators.py
+++ b/numba_cuda/numba/cuda/decorators.py
@@ -17,6 +17,7 @@ def jit(
     func_or_sig=None,
     device=False,
     inline="never",
+    forceinline=False,
     link=[],
     debug=None,
     opt=None,
@@ -39,6 +40,14 @@ def jit(
        .. note:: A kernel cannot have any return value.
     :param device: Indicates whether this is a device function.
     :type device: bool
+    :param inline: Enables inlining at the Numba IR level when set to
+       ``"always"``. See `Notes on Inlining
+       <https://numba.readthedocs.io/en/stable/developer/inlining.html>`_.
+    :type inline: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+       ``True``. This is accomplished by adding the ``alwaysinline`` function
+       attribute to the function definition.
+    :type forceinline: bool
     :param link: A list of files containing PTX or CUDA C/C++ source to link
        with the function
     :type link: list
@@ -85,7 +94,9 @@ def jit(
         DeprecationWarning(
             "Passing bool to inline argument is deprecated, please refer to "
             "Numba's documentation on inlining: "
-            "https://numba.readthedocs.io/en/stable/developer/inlining.html"
+            "https://numba.readthedocs.io/en/stable/developer/inlining.html. "
+            "You may have wanted the forceinline argument instead, to force "
+            "inlining at the NVVM IR level."
         )
 
         inline = "always" if inline else "never"
@@ -140,6 +151,7 @@ def _jit(func):
         targetoptions["fastmath"] = fastmath
         targetoptions["device"] = device
         targetoptions["inline"] = inline
+        targetoptions["forceinline"] = forceinline
         targetoptions["extensions"] = extensions
 
         disp = CUDADispatcher(func, targetoptions=targetoptions)
@@ -182,6 +194,7 @@ def autojitwrapper(func):
                 func,
                 device=device,
                 inline=inline,
+                forceinline=forceinline,
                 debug=debug,
                 opt=opt,
                 lineinfo=lineinfo,
@@ -206,6 +219,7 @@ def autojitwrapper(func):
         targetoptions["fastmath"] = fastmath
         targetoptions["device"] = device
         targetoptions["inline"] = inline
+        targetoptions["forceinline"] = forceinline
        targetoptions["extensions"] = extensions
 
         disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index 72344aa22..d0b14f7b5 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -137,6 +137,7 @@ def __init__(
         debug=False,
         lineinfo=False,
         inline=False,
+        forceinline=False,
         fastmath=False,
         extensions=None,
         max_registers=None,
@@ -182,7 +183,7 @@ def __init__(
             self.argtypes,
             debug=self.debug,
             lineinfo=lineinfo,
-            inline=inline,
+            forceinline=forceinline,
             fastmath=fastmath,
             nvvm_options=nvvm_options,
             cc=cc,
@@ -1073,7 +1074,7 @@ def compile_device(self, args, return_type=None):
         with self._compiling_counter:
             debug = self.targetoptions.get("debug")
             lineinfo = self.targetoptions.get("lineinfo")
-            inline = self.targetoptions.get("inline")
+            forceinline = self.targetoptions.get("forceinline")
             fastmath = self.targetoptions.get("fastmath")
 
             nvvm_options = {
@@ -1091,7 +1092,7 @@ def compile_device(self, args, return_type=None):
                 args,
                 debug=debug,
                 lineinfo=lineinfo,
-                inline=inline,
+                forceinline=forceinline,
                 fastmath=fastmath,
                 nvvm_options=nvvm_options,
                 cc=cc,
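On the decorator side, forceinline is carried through targetoptions into compile_cuda(), independently of the existing Numba-IR inline kwarg. A sketch of the intended decorator usage, mirroring the new tests below (a CUDA device is assumed at run time):

import numpy as np
from numba import cuda, types

# forceinline=True adds the alwaysinline attribute to the NVVM IR
# definition of set_zero; inline="always" would instead inline the call
# at the Numba IR level.
@cuda.jit(forceinline=True)
def set_zero(a):
    a[0] = 0

@cuda.jit((types.int32[::1],))
def call_set_zero(a):
    set_zero(a)

a = np.ones(2, dtype=np.int32)
call_set_zero[1, 2](a)
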
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_inline.py b/numba_cuda/numba/cuda/tests/cudapy/test_inline.py
index d969d05bc..2708b2c16 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_inline.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_inline.py
@@ -8,8 +8,8 @@
 )
 
 
+@skip_on_cudasim("Cudasim does not support inline and forceinline")
 class TestCudaInline(CUDATestCase):
-    @skip_on_cudasim("Cudasim does not support inline")
     def _test_call_inline(self, inline):
         """Test @cuda.jit(inline=...)"""
         a = np.ones(2, dtype=np.int32)
@@ -42,6 +42,9 @@ def call_set_zero(a):
         # check that call was not inlined
         self.assertIsNotNone(match, msg=llvm_ir)
 
+        # alwaysinline should not be in the IR when the inline kwarg is used
+        self.assertNotIn("alwaysinline", llvm_ir)
+
     def test_call_inline_always(self):
         self._test_call_inline("always")
 
@@ -54,6 +57,100 @@ def test_call_inline_true(self):
     def test_call_inline_false(self):
         self._test_call_inline(False)
 
+    def _test_call_forceinline(self, forceinline):
+        """Test @cuda.jit(forceinline=...)"""
+        a = np.ones(2, dtype=np.int32)
+
+        sig = (types.int32[::1],)
+
+        @cuda.jit(forceinline=forceinline)
+        def set_zero(a):
+            a[0] = 0
+
+        @cuda.jit(sig)
+        def call_set_zero(a):
+            set_zero(a)
+
+        call_set_zero[1, 2](a)
+
+        expected = np.arange(2, dtype=np.int32)
+        self.assertTrue(np.all(a == expected))
+
+        llvm_ir = call_set_zero.inspect_llvm(sig)
+        pat = r"call [a-zA-Z0-9]* @"
+        match = re.compile(pat).search(llvm_ir)
+
+        # Check that call was not inlined at the Numba IR level - the call
+        # should still be present in the IR
+        self.assertIsNotNone(match)
+
+        # Check the definition of set_zero - it is a definition where the
+        # name does not include an underscore just before "set_zero", because
+        # that would match the "call_set_zero" definition
+        pat = r"define.*[^_]set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        if forceinline:
+            self.assertIn("alwaysinline", match.group())
+        else:
+            self.assertNotIn("alwaysinline", match.group())
+
+        # The kernel, "call_set_zero", should never have "alwaysinline" set
+        pat = r"define.*call_set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        self.assertNotIn("alwaysinline", match.group())
+
+    def test_call_forceinline_true(self):
+        self._test_call_forceinline(True)
+
+    def test_call_forceinline_false(self):
+        self._test_call_forceinline(False)
+
+    def test_compile_forceinline_ltoir_only(self):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        msg = r"Can only designate forced inlining in LTO-IR"
+        with self.assertRaisesRegex(ValueError, msg):
+            cuda.compile(
+                set_zero,
+                args,
+                device=True,
+                forceinline=True,
+            )
+
+    def _compile_set_zero(self, forceinline):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        ltoir, resty = cuda.compile(
+            set_zero,
+            args,
+            device=True,
+            output="ltoir",
+            forceinline=forceinline,
+        )
+
+        # Sanity check
+        self.assertEqual(resty, types.none)
+
+        return ltoir
+
+    def test_compile_forceinline(self):
+        ltoir_noinline = self._compile_set_zero(False)
+        ltoir_forceinline = self._compile_set_zero(True)
+
+        # As LTO-IR is opaque, the best we can do is check that changing the
+        # flag resulted in a change in the generated LTO-IR in some way.
+        self.assertNotEqual(
+            ltoir_noinline,
+            ltoir_forceinline,
+            "forceinline flag appeared to have no effect on LTO-IR",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
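Outside the test suite, the effect of the flag can be checked by hand in the same way the tests do: the alwaysinline attribute should appear only on the device function's define line, never on the kernel's. A sketch of that manual check, reusing the names from the test above and assuming a CUDA device is available:

import re
import numpy as np
from numba import cuda, types

@cuda.jit(forceinline=True)
def set_zero(a):
    a[0] = 0

sig = (types.int32[::1],)

@cuda.jit(sig)
def call_set_zero(a):
    set_zero(a)

call_set_zero[1, 2](np.ones(2, dtype=np.int32))

llvm_ir = call_set_zero.inspect_llvm(sig)
# Match the set_zero definition but not call_set_zero, as in the test above
defn = re.search(r"define.*[^_]set_zero.*", llvm_ir)
print("alwaysinline" in defn.group())  # expected: True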