From 8c1cd8be8ccdba674abc5d614446d96ba8e2c98d Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 25 Jun 2025 14:19:39 +0100 Subject: [PATCH 1/2] Enable LTO by default when pynvjitlink is available Enabling LTO by default when pynvjitlink is available should: - Provide a general improvement in performance for various use cases, particularly those linking external code. This ought to be benchmarked, but I'm making an assumption that it helps for now based on prior anecdotal / informal experience. - Make the case where users link LTO-IR to kernels or as part of device function declarations "just work" as long as pynvjitlink is installed. A further improvement would still be to error out when a user tries to link LTO-IR when pynvjitlink is not installed - that is left to be done in a future PR. --- numba_cuda/numba/cuda/decorators.py | 14 ++++++++++++++ numba_cuda/numba/cuda/tests/cudapy/test_errors.py | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py index 9b7fd4c72..704d7c9f1 100644 --- a/numba_cuda/numba/cuda/decorators.py +++ b/numba_cuda/numba/cuda/decorators.py @@ -24,6 +24,7 @@ def jit( lineinfo=False, cache=False, launch_bounds=None, + lto=None, **kws, ): """ @@ -83,6 +84,10 @@ def jit( If a scalar is provided, it is used as the maximum number of threads per block. :type launch_bounds: int | tuple[int] + :param lto: Whether to enable LTO. If unspecified, LTO is enabled by + default when pynvjitlink is available, except for kernels where + ``debug=True``. 
+ :type lto: bool """ if link and config.ENABLE_CUDASIM: @@ -136,6 +141,13 @@ def jit( if device and kws.get("link"): raise ValueError("link keyword invalid for device function") + if lto is None: + # Default to using LTO if pynvjitlink is available and we're not debugging + lto = config.CUDA_ENABLE_PYNVJITLINK and not debug + else: + if lto and not config.CUDA_ENABLE_PYNVJITLINK: + raise RuntimeError("LTO requires pynvjitlink, which is not enabled") + if sigutils.is_signature(func_or_sig): signatures = [func_or_sig] specialized = True @@ -165,6 +177,7 @@ def _jit(func): targetoptions["forceinline"] = forceinline targetoptions["extensions"] = extensions targetoptions["launch_bounds"] = launch_bounds + targetoptions["lto"] = lto disp = CUDADispatcher(func, targetoptions=targetoptions) @@ -235,6 +248,7 @@ def autojitwrapper(func): targetoptions["forceinline"] = forceinline targetoptions["extensions"] = extensions targetoptions["launch_bounds"] = launch_bounds + targetoptions["lto"] = lto disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions) if cache: diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py index 0b24bee8e..befe4b42e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py @@ -1,6 +1,7 @@ from numba import cuda from numba.core.errors import TypingError from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim +from numba.tests.support import override_config def noop(x): @@ -89,6 +90,16 @@ def kernel_func(): self.assertIn("resolving callee type: type(CUDADispatcher", excstr) self.assertIn("NameError: name 'floor' is not defined", excstr) + def test_lto_without_pynvjitlink_error(self): + with self.assertRaisesRegex(RuntimeError, "LTO requires pynvjitlink"): + with override_config("CUDA_ENABLE_PYNVJITLINK", False): + + @cuda.jit(lto=True) + def f(): + pass + + f[1, 1]() + if __name__ == "__main__": unittest.main() 
From bb8ce7a80741c43f098ea4d760fb65b34eca57ec Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 26 Jun 2025 10:59:24 +0100 Subject: [PATCH 2/2] Update tests to match LTO-by-default message Also skip an irrelevant test on cudasim. --- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py | 4 ++-- numba_cuda/numba/cuda/tests/cudapy/test_errors.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 8cd33bb9a..8dca18901 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -304,7 +304,7 @@ def test_linker_disabled_envvar(self): env = os.environ.copy() env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0" with self.assertRaisesRegex( - AssertionError, "LTO and additional flags require PyNvJitLinker" + AssertionError, "LTO requires pynvjitlink, which is not enabled" ): # Actual error raised is `ValueError`, but `run_in_subprocess` # reraises as AssertionError. 
@@ -323,7 +323,7 @@ def test_linker_disabled_config(self): env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) with override_config("CUDA_ENABLE_PYNVJITLINK", False): with self.assertRaisesRegex( - AssertionError, "LTO and additional flags require PyNvJitLinker" + AssertionError, "LTO requires pynvjitlink, which is not enabled" ): run_in_subprocess( self.src.format( diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py index befe4b42e..c663da0a6 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py @@ -90,6 +90,7 @@ def kernel_func(): self.assertIn("resolving callee type: type(CUDADispatcher", excstr) self.assertIn("NameError: name 'floor' is not defined", excstr) + @skip_on_cudasim("Simulator does not use pynvjitlink") def test_lto_without_pynvjitlink_error(self): with self.assertRaisesRegex(RuntimeError, "LTO requires pynvjitlink"): with override_config("CUDA_ENABLE_PYNVJITLINK", False):