From 8c1cd8be8ccdba674abc5d614446d96ba8e2c98d Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Wed, 25 Jun 2025 14:19:39 +0100
Subject: [PATCH 1/2] Enable LTO by default when pynvjitlink is available

Enabling LTO by default when pynvjitlink is available should:

- Provide a general improvement in performance for various use cases,
  particularly those linking external code. This ought to be
  benchmarked; for now I'm assuming that it helps, based on prior
  anecdotal / informal experience.
- Make the case where users link LTO-IR to kernels or as part of device
  function declarations "just work" as long as pynvjitlink is installed.

A further improvement would still be to error out when a user tries to
link LTO-IR when pynvjitlink is not installed - that is left to be done
in a future PR.
---
 numba_cuda/numba/cuda/decorators.py               | 14 ++++++++++++++
 numba_cuda/numba/cuda/tests/cudapy/test_errors.py | 11 +++++++++++
 2 files changed, 25 insertions(+)

diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py
index 9b7fd4c72..704d7c9f1 100644
--- a/numba_cuda/numba/cuda/decorators.py
+++ b/numba_cuda/numba/cuda/decorators.py
@@ -24,6 +24,7 @@ def jit(
     lineinfo=False,
     cache=False,
     launch_bounds=None,
+    lto=None,
     **kws,
 ):
     """
@@ -83,6 +84,10 @@ def jit(
                           If a scalar is provided, it is used as the maximum
                           number of threads per block.
     :type launch_bounds: int | tuple[int]
+    :param lto: Whether to enable LTO. If unspecified, LTO is enabled by
+                default when pynvjitlink is available, except for kernels where
+                ``debug=True``.
+    :type lto: bool
     """
 
     if link and config.ENABLE_CUDASIM:
@@ -136,6 +141,13 @@ def jit(
     if device and kws.get("link"):
         raise ValueError("link keyword invalid for device function")
 
+    if lto is None:
+        # Default to using LTO if pynvjitlink is available and we're not debugging
+        lto = config.CUDA_ENABLE_PYNVJITLINK and not debug
+    else:
+        if lto and not config.CUDA_ENABLE_PYNVJITLINK:
+            raise RuntimeError("LTO requires pynvjitlink, which is not enabled")
+
     if sigutils.is_signature(func_or_sig):
         signatures = [func_or_sig]
         specialized = True
@@ -165,6 +177,7 @@ def _jit(func):
             targetoptions["forceinline"] = forceinline
             targetoptions["extensions"] = extensions
             targetoptions["launch_bounds"] = launch_bounds
+            targetoptions["lto"] = lto
 
             disp = CUDADispatcher(func, targetoptions=targetoptions)
 
@@ -235,6 +248,7 @@ def autojitwrapper(func):
                 targetoptions["forceinline"] = forceinline
                 targetoptions["extensions"] = extensions
                 targetoptions["launch_bounds"] = launch_bounds
+                targetoptions["lto"] = lto
                 disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
 
                 if cache:
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
index 0b24bee8e..befe4b42e 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
@@ -1,6 +1,7 @@
 from numba import cuda
 from numba.core.errors import TypingError
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+from numba.tests.support import override_config
 
 
 def noop(x):
@@ -89,6 +90,16 @@ def kernel_func():
         self.assertIn("resolving callee type: type(CUDADispatcher", excstr)
         self.assertIn("NameError: name 'floor' is not defined", excstr)
 
+    def test_lto_without_pynvjitlink_error(self):
+        with self.assertRaisesRegex(RuntimeError, "LTO requires pynvjitlink"):
+            with override_config("CUDA_ENABLE_PYNVJITLINK", False):
+
+                @cuda.jit(lto=True)
+                def f():
+                    pass
+
+                f[1, 1]()
+
 
 if __name__ == "__main__":
     unittest.main()

From bb8ce7a80741c43f098ea4d760fb65b34eca57ec Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Thu, 26 Jun 2025 10:59:24 +0100
Subject: [PATCH 2/2] Update tests to match LTO-by-default message

Also skip an irrelevant test on cudasim.
---
 numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py | 4 ++--
 numba_cuda/numba/cuda/tests/cudapy/test_errors.py     | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
index 8cd33bb9a..8dca18901 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -304,7 +304,7 @@ def test_linker_disabled_envvar(self):
         env = os.environ.copy()
         env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"
         with self.assertRaisesRegex(
-            AssertionError, "LTO and additional flags require PyNvJitLinker"
+            AssertionError, "LTO requires pynvjitlink, which is not enabled"
         ):
             # Actual error raised is `ValueError`, but `run_in_subprocess`
             # reraises as AssertionError.
@@ -323,7 +323,7 @@ def test_linker_disabled_config(self):
         env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
         with override_config("CUDA_ENABLE_PYNVJITLINK", False):
             with self.assertRaisesRegex(
-                AssertionError, "LTO and additional flags require PyNvJitLinker"
+                AssertionError, "LTO requires pynvjitlink, which is not enabled"
             ):
                 run_in_subprocess(
                     self.src.format(
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
index befe4b42e..c663da0a6 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
@@ -90,6 +90,7 @@ def kernel_func():
         self.assertIn("resolving callee type: type(CUDADispatcher", excstr)
         self.assertIn("NameError: name 'floor' is not defined", excstr)
 
+    @skip_on_cudasim("Simulator does not use pynvjitlink")
     def test_lto_without_pynvjitlink_error(self):
         with self.assertRaisesRegex(RuntimeError, "LTO requires pynvjitlink"):
             with override_config("CUDA_ENABLE_PYNVJITLINK", False):