Merged
Changes from all commits
Commits
28 commits
4d82e84
checkpointing
isVoid Sep 4, 2025
a126b51
Merge branch 'main' of github.com:NVIDIA/numba-cuda into fea-compile-all
isVoid Sep 4, 2025
fc21ef0
another checkpointing
isVoid Sep 4, 2025
dbc6c38
add from_path in linkable code test
isVoid Sep 9, 2025
97712f5
initial pass on implementing compile_all and tests
isVoid Sep 19, 2025
79d2ab7
add resty assertion
isVoid Sep 19, 2025
2cf6b6c
align compile_all arguments to compile_ptx
isVoid Sep 19, 2025
a318b5d
add lineinfo and debug test
isVoid Sep 20, 2025
0c14dde
fix test case with ctypes binding
isVoid Sep 23, 2025
42cdccf
align arguments with `compile`
isVoid Sep 23, 2025
f778db9
Merge branch 'main' into fea-compile-all
isVoid Sep 23, 2025
edae868
conditionally import cuda.core in tests
isVoid Sep 23, 2025
48548e3
skip linkable code
isVoid Sep 23, 2025
792e8d7
skipping tests that requires pregenerated binaries
isVoid Sep 26, 2025
3813c0a
rename lto var name
isVoid Sep 26, 2025
67ec2fa
add linkable code API docs
isVoid Sep 26, 2025
77f0e85
Merge branch 'main' into fea-compile-all
isVoid Sep 26, 2025
e69affe
update to cuda config registry
isVoid Sep 26, 2025
5408f2f
add compile_all to simulator
isVoid Sep 29, 2025
b1e51bf
expose linkable code in simulator
isVoid Sep 29, 2025
bf4a5f2
Merge branch 'main' into fea-compile-all
isVoid Sep 30, 2025
afa8959
docstring update, remove unused code
isVoid Oct 2, 2025
851f39b
update test_device_function to reduce redundancy
isVoid Oct 2, 2025
90287fe
update test_fastmath and test_global_kernel
isVoid Oct 2, 2025
4b31dbc
update test_kernel/device_function_with_debug
isVoid Oct 2, 2025
4a4baf6
update lineinfo related tests
isVoid Oct 2, 2025
1a04d0b
update cabi, ltoir tests
isVoid Oct 2, 2025
9650d94
Merge branch 'main' into fea-compile-all
gmarkall Oct 2, 2025
2 changes: 2 additions & 0 deletions docs/source/reference/host.rst
@@ -125,6 +125,8 @@ any of the driver API. This can be useful for:

.. autofunction:: numba.cuda.compile

.. autofunction:: numba.cuda.compile_all


The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
the default compute capability targeted by ``compile`` - see
1 change: 1 addition & 0 deletions numba_cuda/numba/cuda/__init__.py
@@ -78,6 +78,7 @@
compile_for_current_device,
compile_ptx,
compile_ptx_for_current_device,
compile_all,
)

# This is the out-of-tree NVIDIA-maintained target. This is reported in Numba
3 changes: 3 additions & 0 deletions numba_cuda/numba/cuda/codegen.py
@@ -471,6 +471,7 @@ def _reduce_states(self):
needs_cudadevrt=self.needs_cudadevrt,
nrt=nrt,
use_cooperative=self.use_cooperative,
lto=self._lto,
)

@classmethod
@@ -488,6 +489,7 @@ def _rebuild(
needs_cudadevrt,
nrt,
use_cooperative,
lto,
):
"""
Rebuild an instance.
@@ -508,6 +510,7 @@
if nrt:
instance._linking_files = {NRT_LIBRARY}

instance._lto = lto
return instance


251 changes: 189 additions & 62 deletions numba_cuda/numba/cuda/compiler.py
@@ -28,6 +28,7 @@
funcdesc,
)
from numba.cuda.cudadrv import nvvm, nvrtc
from numba.cuda.cudadrv.linkable_code import LinkableCode
from numba.cuda.descriptor import cuda_target
from numba.cuda.flags import CUDAFlags
from numba.cuda.target import CUDACABICallConv
@@ -977,6 +978,175 @@ def define_error_gv(postfix):
return helper_func


def compile_all(
pyfunc,
sig,
debug=None,
lineinfo=False,
device=True,
fastmath=False,
cc=None,
opt=None,
abi="c",
abi_info=None,
output="ltoir",
forceinline=False,
launch_bounds=None,
):
"""Similar to ``compile()``, but returns a list of PTX codes/LTO-IRs for
the compiled function and the external functions it depends on.
If external functions are CUDA C++ source, they will be compiled with
NVRTC. Other kinds of external function code (e.g. cubins, fatbins, etc.)
will be added directly to the return list. The output code kind is
determined by the ``output`` parameter, and defaults to ``"ltoir"``.
"""

if output not in ("ptx", "ltoir"):
raise NotImplementedError(f"Unsupported output type: {output}")

if forceinline and output != "ltoir":
raise ValueError("Can only designate forced inlining in LTO-IR")
Contributor

Does this need to be a hard error? Is forceinline a guarantee or can the compiler still ignore it?

Contributor Author

I imagine this is because there is no way to carry the inlining information when you specify the output as PTX. The information can only persist via LTO-IR.

Contributor

That's correct - you can't inline PTX. This looks like new code on the diff, but this PR is only really moving it from the original compile() implementation.
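As an illustration of this constraint, a minimal hedged sketch (assuming the compile_all added in this PR is importable as numba.cuda.compile_all and a CUDA toolkit is available; the example function is hypothetical):

```python
# Hedged sketch of the forceinline/output interaction discussed above.
from numba import cuda, float32

def axpy(a, x, y):
    return a * x + y

sig = (float32, float32, float32)

# LTO-IR output: forceinline is accepted; the inlining hint survives to link time.
codes, resty = cuda.compile_all(axpy, sig, output="ltoir", forceinline=True)

# PTX output: forceinline raises ValueError, since PTX cannot carry the hint.
try:
    cuda.compile_all(axpy, sig, output="ptx", forceinline=True)
except ValueError as e:
    print(e)  # "Can only designate forced inlining in LTO-IR"
```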


lto = output == "ltoir"

cc = _default_cc(cc)

lib, resty = _compile_pyfunc_with_fixup(
pyfunc,
sig,
debug=debug,
lineinfo=lineinfo,
device=device,
fastmath=fastmath,
cc=cc,
opt=opt,
abi=abi,
abi_info=abi_info,
forceinline=forceinline,
launch_bounds=launch_bounds,
lto=lto,
)

if lto:
code = lib.get_ltoir(cc=cc)
else:
code = lib.get_asm_str(cc=cc)
codes = [code]

# linking_files
is_ltoir = output == "ltoir"
for path_or_obj in lib._linking_files:
obj = LinkableCode.from_path_or_obj(path_or_obj)
Contributor
@rparolin Sep 24, 2025

The variable is called path_or_obj but the code appears to handle cu and obj use cases. Should the variable be renamed, or are you missing the path handling use case?

Contributor Author
@isVoid Sep 26, 2025

The objects in lib._linking_files can be arbitrary linkable code objects or paths, of which only CUDA source files require compilation before they can be fed to the linker. (That is the implied contract for compile_all's return values: everything the API returns should be passable to a linker without additional processing.) Therefore we special-case them below and compile them with NVRTC before returning them to the user.

if obj.kind == "cu":
code, log = nvrtc.compile(
obj.data,
obj.name,
cc,
ltoir=is_ltoir,
lineinfo=lineinfo,
debug=debug,
)
codes.append(code)
else:
codes.append(obj)

return codes, resty
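Relating to the review discussion above about lib._linking_files, a hedged sketch of the intended end-to-end flow (the file name mul.cu, its mul_f32 function, and the link argument to declare_device are illustrative assumptions, not part of this diff):

```python
# Hedged sketch: assumes cuda.declare_device accepts a `link` argument and that
# a hypothetical "mul.cu" defines an extern "C" __device__ function `mul_f32`.
from numba import cuda, float32

mul = cuda.declare_device("mul_f32", float32(float32, float32), link=["mul.cu"])

def scale(x, y):
    return mul(x, y)

codes, resty = cuda.compile_all(scale, (float32, float32), output="ltoir")

# codes[0] holds the LTO-IR for `scale`; the remaining entries are its external
# dependencies - here, "mul.cu" compiled to LTO-IR by NVRTC. Per the comment
# above, every entry should be consumable by a linker without further processing.
```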


def _compile_pyfunc_with_fixup(
pyfunc,
sig,
debug=None,
lineinfo=False,
device=True,
fastmath=False,
cc=None,
opt=None,
abi="c",
abi_info=None,
forceinline=False,
launch_bounds=None,
lto=False,
):
"""Internal method to compile a python function and perform post-processing

- If pyfunc is a kernel, post-processing includes kernel fixup and setting
launch bounds.
- If pyfunc is a device function, post-processing includes ABI wrapper.

`lto` means that all internal pipeline options use LTO.

Returns the code library and return type.
"""
if abi not in ("numba", "c"):
raise NotImplementedError(f"Unsupported ABI: {abi}")

if abi == "c" and not device:
raise NotImplementedError("The C ABI is not supported for kernels")

if forceinline and not device:
raise ValueError("Cannot force-inline kernels")

debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
opt = (config.OPT != 0) if opt is None else opt

if debug and opt:
msg = (
"debug=True with opt=True "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False."
)
warn(NumbaInvalidConfigWarning(msg))

abi_info = abi_info or dict()

nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}

if debug:
nvvm_options["g"] = None

if lto:
nvvm_options["gen-lto"] = None

args, return_type = sigutils.normalize_signature(sig)

cc = _default_cc(cc)

cres = compile_cuda(
pyfunc,
return_type,
args,
debug=debug,
lineinfo=lineinfo,
fastmath=fastmath,
nvvm_options=nvvm_options,
cc=cc,
forceinline=forceinline,
)
resty = cres.signature.return_type

if resty and not device and resty != types.void:
raise TypeError("CUDA kernel must have void return type.")

tgt = cres.target_context

if device:
lib = cres.library
if abi == "c":
wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
lib = cabi_wrap_function(
tgt, lib, cres.fndesc, wrapper_name, nvvm_options
)
else:
lib = cres.library
kernel = lib.get_function(cres.fndesc.llvm_func_name)
lib._entry_name = cres.fndesc.llvm_func_name
kernel_fixup(kernel, debug)
nvvm.set_launch_bounds(kernel, launch_bounds)

return lib, resty


@global_compiler_lock
def compile(
pyfunc,
@@ -1050,82 +1220,28 @@ def compile(
:return: (code, resty): The compiled code and inferred return type
:rtype: tuple
"""
if abi not in ("numba", "c"):
raise NotImplementedError(f"Unsupported ABI: {abi}")

if abi == "c" and not device:
raise NotImplementedError("The C ABI is not supported for kernels")

if output not in ("ptx", "ltoir"):
raise NotImplementedError(f"Unsupported output type: {output}")

if forceinline and not device:
raise ValueError("Cannot force-inline kernels")

if forceinline and output != "ltoir":
raise ValueError("Can only designate forced inlining in LTO-IR")

debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
opt = (config.OPT != 0) if opt is None else opt

if debug and opt:
msg = (
"debug=True with opt=True "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False."
)
warn(NumbaInvalidConfigWarning(msg))

lto = output == "ltoir"
abi_info = abi_info or dict()

nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}

if debug:
nvvm_options["g"] = None

if lto:
nvvm_options["gen-lto"] = None

args, return_type = sigutils.normalize_signature(sig)

# If the user has used the config variable to specify a non-default that is
# greater than the lowest non-deprecated one, then we should default to
# their specified CC instead of the lowest non-deprecated one.
MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
cc = cc or MIN_CC

cres = compile_cuda(
lib, resty = _compile_pyfunc_with_fixup(
pyfunc,
return_type,
args,
sig,
debug=debug,
lineinfo=lineinfo,
device=device,
fastmath=fastmath,
nvvm_options=nvvm_options,
cc=cc,
opt=opt,
abi=abi,
abi_info=abi_info,
forceinline=forceinline,
launch_bounds=launch_bounds,
lto=lto,
)
resty = cres.signature.return_type

if resty and not device and resty != types.void:
raise TypeError("CUDA kernel must have void return type.")

tgt = cres.target_context

if device:
lib = cres.library
if abi == "c":
wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
lib = cabi_wrap_function(
tgt, lib, cres.fndesc, wrapper_name, nvvm_options
)
else:
lib = cres.library
kernel = lib.get_function(cres.fndesc.llvm_func_name)
lib._entry_name = cres.fndesc.llvm_func_name
kernel_fixup(kernel, debug)
nvvm.set_launch_bounds(kernel, launch_bounds)

if lto:
code = lib.get_ltoir(cc=cc)
@@ -1272,3 +1388,14 @@ class ExternFunction:
def __init__(self, name, sig):
self.name = name
self.sig = sig


def _default_cc(cc):
"""
Return the compute capability to target.

If the user specifies ``cc``, return it unchanged; otherwise return the
larger of ``config.CUDA_DEFAULT_PTX_CC`` and the lowest compute capability
supported by NVRTC.
"""
if cc:
return cc
return max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
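To make the fallback in _default_cc concrete, a small standalone sketch (the numeric compute capabilities are illustrative assumptions, not values taken from this PR):

```python
# Illustrative model of _default_cc's selection logic; the concrete
# (major, minor) values below are assumptions for the example only.
def pick_cc(cc, configured_cc=(7, 5), lowest_supported_cc=(7, 0)):
    # An explicit compute capability from the caller always wins.
    if cc:
        return cc
    # Otherwise use the higher of the configured default and the lowest
    # compute capability NVRTC still supports (tuples compare element-wise).
    return max(configured_cc, lowest_supported_cc)

print(pick_cc(None))    # (7, 5) - the configured default is the higher of the two
print(pick_cc((9, 0)))  # (9, 0) - an explicit cc overrides the fallback
```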