diff --git a/docs/source/reference/host.rst b/docs/source/reference/host.rst
index 64bebd2d9..dbfd06d66 100644
--- a/docs/source/reference/host.rst
+++ b/docs/source/reference/host.rst
@@ -125,6 +125,8 @@ any of the driver API. This can be useful for:
 
 .. autofunction:: numba.cuda.compile
 
+.. autofunction:: numba.cuda.compile_all
+
 The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
 the default compute capability targeted by ``compile`` - see
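A minimal sketch (not part of the patch) of how the documented environment variable interacts with ``compile``. The value ``"7.5"`` and the explicit cc tuple are illustrative assumptions:

# Illustrative: the configured default CC applies only when no explicit cc
# is passed; an explicit cc always takes precedence.
import os

os.environ["NUMBA_CUDA_DEFAULT_PTX_CC"] = "7.5"  # set before importing

from numba import cuda, float32

def add(x, y):
    return x + y

# No explicit cc: targets max(configured default, lowest CC NVRTC supports).
ptx, resty = cuda.compile(add, (float32, float32), device=True)

# Explicit cc overrides the environment variable.
ptx_90, _ = cuda.compile(add, (float32, float32), device=True, cc=(9, 0))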
+ """ + + if output not in ("ptx", "ltoir"): + raise NotImplementedError(f"Unsupported output type: {output}") + + if forceinline and output != "ltoir": + raise ValueError("Can only designate forced inlining in LTO-IR") + + lto = output == "ltoir" + + cc = _default_cc(cc) + + lib, resty = _compile_pyfunc_with_fixup( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + forceinline=forceinline, + launch_bounds=launch_bounds, + lto=lto, + ) + + if lto: + code = lib.get_ltoir(cc=cc) + else: + code = lib.get_asm_str(cc=cc) + codes = [code] + + # linking_files + is_ltoir = output == "ltoir" + for path_or_obj in lib._linking_files: + obj = LinkableCode.from_path_or_obj(path_or_obj) + if obj.kind == "cu": + code, log = nvrtc.compile( + obj.data, + obj.name, + cc, + ltoir=is_ltoir, + lineinfo=lineinfo, + debug=debug, + ) + codes.append(code) + else: + codes.append(obj) + + return codes, resty + + +def _compile_pyfunc_with_fixup( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=True, + fastmath=False, + cc=None, + opt=None, + abi="c", + abi_info=None, + forceinline=False, + launch_bounds=None, + lto=False, +): + """Internal method to compile a python function and perform post-processing + + - If pyfunc is a kernel, post-processing includes kernel fixup and setting + launch bounds. + - If pyfunc is a device function, post-processing includes ABI wrapper. + + `lto` means that all internal pipeline options use LTO. + + Returns the code library and return type. + """ + if abi not in ("numba", "c"): + raise NotImplementedError(f"Unsupported ABI: {abi}") + + if abi == "c" and not device: + raise NotImplementedError("The C ABI is not supported for kernels") + + if forceinline and not device: + raise ValueError("Cannot force-inline kernels") + + debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug + opt = (config.OPT != 0) if opt is None else opt + + if debug and opt: + msg = ( + "debug=True with opt=True " + "is not supported by CUDA. This may result in a crash" + " - set debug=False or opt=False." 
diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index 40ff6ab00..bb6cd1c67 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -28,6 +28,7 @@
     funcdesc,
 )
 from numba.cuda.cudadrv import nvvm, nvrtc
+from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.flags import CUDAFlags
 from numba.cuda.target import CUDACABICallConv
@@ -977,6 +978,175 @@ def define_error_gv(postfix):
     return helper_func
 
 
+def compile_all(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    output="ltoir",
+    forceinline=False,
+    launch_bounds=None,
+):
+    """Similar to ``compile()``, but returns a list of PTX codes or LTO-IRs
+    for the compiled function and the external functions it depends on.
+    If an external function is CUDA C++ source, it is compiled with NVRTC.
+    Other kinds of external function code (e.g. cubins, fatbins, etc.) are
+    added directly to the returned list. The output code kind is determined
+    by the ``output`` parameter, which defaults to ``"ltoir"``.
+    """
+
+    if output not in ("ptx", "ltoir"):
+        raise NotImplementedError(f"Unsupported output type: {output}")
+
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
+
+    lto = output == "ltoir"
+
+    cc = _default_cc(cc)
+
+    lib, resty = _compile_pyfunc_with_fixup(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        forceinline=forceinline,
+        launch_bounds=launch_bounds,
+        lto=lto,
+    )
+
+    if lto:
+        code = lib.get_ltoir(cc=cc)
+    else:
+        code = lib.get_asm_str(cc=cc)
+    codes = [code]
+
+    # Compile or pass through the external functions that the compiled
+    # function links against.
+    for path_or_obj in lib._linking_files:
+        obj = LinkableCode.from_path_or_obj(path_or_obj)
+        if obj.kind == "cu":
+            code, log = nvrtc.compile(
+                obj.data,
+                obj.name,
+                cc,
+                ltoir=lto,
+                lineinfo=lineinfo,
+                debug=debug,
+            )
+            codes.append(code)
+        else:
+            codes.append(obj)
+
+    return codes, resty
+
+
+def _compile_pyfunc_with_fixup(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    forceinline=False,
+    launch_bounds=None,
+    lto=False,
+):
+    """Internal helper to compile a Python function and post-process it.
+
+    - If pyfunc is a kernel, post-processing performs the kernel fixup and
+      sets the launch bounds.
+    - If pyfunc is a device function, it is wrapped for the requested ABI.
+
+    If ``lto`` is True, the internal pipeline is configured to emit LTO-IR.
+
+    Returns the code library and the inferred return type.
+    """
+    if abi not in ("numba", "c"):
+        raise NotImplementedError(f"Unsupported ABI: {abi}")
+
+    if abi == "c" and not device:
+        raise NotImplementedError("The C ABI is not supported for kernels")
+
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+
+    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
+    opt = (config.OPT != 0) if opt is None else opt
+
+    if debug and opt:
+        msg = (
+            "debug=True with opt=True "
+            "is not supported by CUDA. This may result in a crash"
+            " - set debug=False or opt=False."
+        )
+        warn(NumbaInvalidConfigWarning(msg))
+
+    abi_info = abi_info or dict()
+
+    nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
+
+    if debug:
+        nvvm_options["g"] = None
+
+    if lto:
+        nvvm_options["gen-lto"] = None
+
+    args, return_type = sigutils.normalize_signature(sig)
+
+    cc = _default_cc(cc)
+
+    cres = compile_cuda(
+        pyfunc,
+        return_type,
+        args,
+        debug=debug,
+        lineinfo=lineinfo,
+        fastmath=fastmath,
+        nvvm_options=nvvm_options,
+        cc=cc,
+        forceinline=forceinline,
+    )
+    resty = cres.signature.return_type
+
+    if resty and not device and resty != types.void:
+        raise TypeError("CUDA kernel must have void return type.")
+
+    tgt = cres.target_context
+
+    if device:
+        lib = cres.library
+        if abi == "c":
+            wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
+            lib = cabi_wrap_function(
+                tgt, lib, cres.fndesc, wrapper_name, nvvm_options
+            )
+    else:
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, debug)
+        nvvm.set_launch_bounds(kernel, launch_bounds)
+
+    return lib, resty
+
+
 @global_compiler_lock
 def compile(
     pyfunc,
@@ -1050,82 +1220,30 @@ def compile(
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
-    if abi not in ("numba", "c"):
-        raise NotImplementedError(f"Unsupported ABI: {abi}")
-
-    if abi == "c" and not device:
-        raise NotImplementedError("The C ABI is not supported for kernels")
-
     if output not in ("ptx", "ltoir"):
         raise NotImplementedError(f"Unsupported output type: {output}")
 
-    if forceinline and not device:
-        raise ValueError("Cannot force-inline kernels")
-
     if forceinline and output != "ltoir":
         raise ValueError("Can only designate forced inlining in LTO-IR")
 
-    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
-    opt = (config.OPT != 0) if opt is None else opt
-
-    if debug and opt:
-        msg = (
-            "debug=True with opt=True "
-            "is not supported by CUDA. This may result in a crash"
-            " - set debug=False or opt=False."
-        )
-        warn(NumbaInvalidConfigWarning(msg))
-
     lto = output == "ltoir"
 
-    abi_info = abi_info or dict()
-
-    nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
-
-    if debug:
-        nvvm_options["g"] = None
-
-    if lto:
-        nvvm_options["gen-lto"] = None
-
-    args, return_type = sigutils.normalize_signature(sig)
-
-    # If the user has used the config variable to specify a non-default that is
-    # greater than the lowest non-deprecated one, then we should default to
-    # their specified CC instead of the lowest non-deprecated one.
-    MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
-    cc = cc or MIN_CC
-
-    cres = compile_cuda(
+    # Resolve the CC here as well, since it is used again below when
+    # generating the output code.
+    cc = _default_cc(cc)
+
+    lib, resty = _compile_pyfunc_with_fixup(
         pyfunc,
-        return_type,
-        args,
+        sig,
         debug=debug,
         lineinfo=lineinfo,
+        device=device,
         fastmath=fastmath,
-        nvvm_options=nvvm_options,
         cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
+        lto=lto,
     )
-    resty = cres.signature.return_type
-
-    if resty and not device and resty != types.void:
-        raise TypeError("CUDA kernel must have void return type.")
-
-    tgt = cres.target_context
-
-    if device:
-        lib = cres.library
-        if abi == "c":
-            wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
-            lib = cabi_wrap_function(
-                tgt, lib, cres.fndesc, wrapper_name, nvvm_options
-            )
-    else:
-        lib = cres.library
-        kernel = lib.get_function(cres.fndesc.llvm_func_name)
-        lib._entry_name = cres.fndesc.llvm_func_name
-        kernel_fixup(kernel, debug)
-        nvvm.set_launch_bounds(kernel, launch_bounds)
 
     if lto:
         code = lib.get_ltoir(cc=cc)
@@ -1272,3 +1392,16 @@ class ExternFunction:
     def __init__(self, name, sig):
         self.name = name
         self.sig = sig
+
+
+def _default_cc(cc):
+    """
+    Return the default compute capability.
+
+    If the user specifies a cc, return it. Otherwise return the greater of
+    the configured default (``NUMBA_CUDA_DEFAULT_PTX_CC``) and the lowest
+    compute capability supported by NVRTC.
+    """
+    if cc:
+        return cc
+    return max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
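A minimal usage sketch of the new API, mirroring the tests below (the external source path ``add.cu`` and the function name ``add_from_external`` are illustrative assumptions):

# compile_all returns one code object per translation unit: the
# Numba-compiled function first, then each external dependency.
from numba import cuda, uint32, void

# An external device function implemented in CUDA C++, linked by path.
add = cuda.declare_device(
    "add_from_external", "uint32(uint32, uint32)", link=["add.cu"]
)

def kernel(r, x, y):
    r[0] = add(x, y)

# output="ltoir" is the default; here we ask for PTX instead.
codes, resty = cuda.compile_all(
    kernel, (uint32[::1], uint32, uint32),
    device=False, abi="numba", output="ptx",
)
assert resty == void
assert len(codes) == 2  # the kernel's PTX, then the NVRTC-compiled source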
diff --git a/numba_cuda/numba/cuda/cudadrv/linkable_code.py b/numba_cuda/numba/cuda/cudadrv/linkable_code.py
index 106d84ed3..c2d8c0bda 100644
--- a/numba_cuda/numba/cuda/cudadrv/linkable_code.py
+++ b/numba_cuda/numba/cuda/cudadrv/linkable_code.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import io
+import os
+from typing import Union, Type
+
 from .mappings import FILE_EXTENSION_MAP
 
 
@@ -52,6 +55,66 @@ def data(self):
             return self._data.getvalue()
         return self._data
 
+    @staticmethod
+    def from_path(path: str):
+        """
+        Load a linkable code object from a file.
+
+        Parameters
+        ----------
+        path : str
+            The path to the file to load.
+
+        Returns
+        -------
+        LinkableCode
+            The linkable code object.
+
+        Raises
+        ------
+        ValueError
+            If the file extension is not supported.
+        """
+        root, extension = os.path.splitext(path)
+        basename = os.path.basename(root)
+        if extension in (".cu", ".ptx"):
+            mode = "r"
+        else:
+            mode = "rb"
+
+        with open(path, mode) as f:
+            data = f.read()
+
+        cls = _extension_to_linkable_code_kind(extension)
+        return cls(data, name=basename)
+
+    @classmethod
+    def from_path_or_obj(cls, path_or_obj: Union[str, "LinkableCode"]):
+        """
+        Load a linkable code object from a file path or an existing object.
+
+        If a path is provided, the file is loaded and a LinkableCode object
+        is returned. If a LinkableCode object is provided, it is returned
+        as-is.
+
+        Parameters
+        ----------
+        path_or_obj : str or LinkableCode
+            The path to the file or the LinkableCode object to load.
+
+        Returns
+        -------
+        LinkableCode
+            The linkable code object.
+
+        Raises
+        ------
+        ValueError
+            If the file extension is not supported.
+        """
+        if isinstance(path_or_obj, str):
+            return cls.from_path(path_or_obj)
+        return path_or_obj
+
 
 class PTXSource(LinkableCode):
     """PTX source code in memory."""
@@ -100,3 +163,22 @@ class LTOIR(LinkableCode):
 
     kind = FILE_EXTENSION_MAP["ltoir"]
     default_name = "<unnamed-ltoir>"
+
+
+def _extension_to_linkable_code_kind(extension: str) -> Type[LinkableCode]:
+    if extension == ".cu":
+        return CUSource
+    elif extension == ".ptx":
+        return PTXSource
+    elif extension == ".fatbin":
+        return Fatbin
+    elif extension == ".cubin":
+        return Cubin
+    elif extension == ".a":
+        return Archive
+    elif extension == ".o":
+        return Object
+    elif extension == ".ltoir":
+        return LTOIR
+    else:
+        raise ValueError(f"Unknown extension: {extension}")
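A sketch of the new loaders (the file names are illustrative). ``from_path`` picks the concrete subclass from the file extension; ``from_path_or_obj`` additionally accepts an existing object and returns it unchanged:

from numba.cuda.cudadrv.linkable_code import LinkableCode, CUSource

# .cu and .ptx files are read as text; all other kinds are read as bytes.
src = LinkableCode.from_path("helpers.cu")     # -> CUSource
obj = LinkableCode.from_path("helpers.cubin")  # -> Cubin

# An in-memory object passes through untouched.
in_memory = CUSource("__device__ int one() { return 1; }")
assert LinkableCode.from_path_or_obj(in_memory) is in_memory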
+ """ + if isinstance(path_or_obj, str): + return cls.from_path(path_or_obj) + return path_or_obj + class PTXSource(LinkableCode): """PTX source code in memory.""" @@ -100,3 +163,22 @@ class LTOIR(LinkableCode): kind = FILE_EXTENSION_MAP["ltoir"] default_name = "" + + +def _extension_to_linkable_code_kind(extension: str) -> Type[LinkableCode]: + if extension == ".cu": + return CUSource + elif extension == ".ptx": + return PTXSource + elif extension == ".fatbin": + return Fatbin + elif extension == ".cubin": + return Cubin + elif extension == ".a": + return Archive + elif extension == ".o": + return Object + elif extension == ".ltoir": + return LTOIR + else: + raise ValueError(f"Unknown extension: {extension}") diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 23803fc77..cef0b576c 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -289,7 +289,7 @@ def get_lto(self, program): return lto -def compile(src, name, cc, ltoir=False): +def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False): """ Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability. @@ -301,7 +301,11 @@ def compile(src, name, cc, ltoir=False): :type cc: tuple :param ltoir: Compile into LTOIR if True, otherwise into PTX :type ltoir: bool - :return: The compiled PTX and compilation log + :param lineinfo: Whether to include line information in the compiled code + :type lineinfo: bool + :param debug: Whether to include debug information in the compiled code + :type debug: bool + :return: The compiled PTX or LTOIR and compilation log :rtype: tuple """ @@ -384,6 +388,8 @@ def compile(src, name, cc, ltoir=False): relocatable_device_code=True, link_time_optimization=ltoir, name=name, + debug=debug, + lineinfo=lineinfo, ) class Logger: @@ -418,6 +424,10 @@ def write(self, msg): if ltoir: options.append("-dlto") + if lineinfo: + options.append("-lineinfo") + if debug: + options.append("-G") # Compile the program compile_error = nvrtc.compile_program(program, options) diff --git a/numba_cuda/numba/cuda/simulator/__init__.py b/numba_cuda/numba/cuda/simulator/__init__.py index da0d4b233..27cb64ed0 100644 --- a/numba_cuda/numba/cuda/simulator/__init__.py +++ b/numba_cuda/numba/cuda/simulator/__init__.py @@ -20,6 +20,7 @@ from .cudadrv.devices import require_context, gpus from .cudadrv.devices import get_context as current_context from .cudadrv.runtime import runtime +from .cudadrv.linkable_code import LinkableCode from numba.cuda.core import config reduce = Reduce diff --git a/numba_cuda/numba/cuda/simulator/compiler.py b/numba_cuda/numba/cuda/simulator/compiler.py index e11aeb2f0..1bf5a3af0 100644 --- a/numba_cuda/numba/cuda/simulator/compiler.py +++ b/numba_cuda/numba/cuda/simulator/compiler.py @@ -11,6 +11,7 @@ compile_ptx = None compile_ptx_for_current_device = None declare_device_function = None +compile_all = None def run_frontend(func, inline_closures=False, emit_dels=False): diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py new file mode 100644 index 000000000..39b07d710 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
diff --git a/numba_cuda/numba/cuda/simulator/__init__.py b/numba_cuda/numba/cuda/simulator/__init__.py
index da0d4b233..27cb64ed0 100644
--- a/numba_cuda/numba/cuda/simulator/__init__.py
+++ b/numba_cuda/numba/cuda/simulator/__init__.py
@@ -20,6 +20,7 @@
 from .cudadrv.devices import require_context, gpus
 from .cudadrv.devices import get_context as current_context
 from .cudadrv.runtime import runtime
+from .cudadrv.linkable_code import LinkableCode
 from numba.cuda.core import config
 
 reduce = Reduce
diff --git a/numba_cuda/numba/cuda/simulator/compiler.py b/numba_cuda/numba/cuda/simulator/compiler.py
index e11aeb2f0..1bf5a3af0 100644
--- a/numba_cuda/numba/cuda/simulator/compiler.py
+++ b/numba_cuda/numba/cuda/simulator/compiler.py
@@ -11,6 +11,7 @@
 compile_ptx = None
 compile_ptx_for_current_device = None
 declare_device_function = None
+compile_all = None
 
 
 def run_frontend(func, inline_closures=False, emit_dels=False):
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py
new file mode 100644
index 000000000..39b07d710
--- /dev/null
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import os
+from numba import cuda
+from numba.cuda.cudadrv.linkable_code import LinkableCode
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+import unittest
+
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_a = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.a"
+    )
+    test_device_functions_cubin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cubin"
+    )
+    test_device_functions_cu = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cu"
+    )
+    test_device_functions_fatbin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.fatbin"
+    )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
+    test_device_functions_o = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.o"
+    )
+    test_device_functions_ptx = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ptx"
+    )
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+
+
+class TestLinkableCode(CUDATestCase):
+    @skip_on_cudasim(reason="Simulator does not support linkable code")
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_linkable_code_from_path_or_obj(self):
+        files_kind = [
+            (test_device_functions_a, cuda.Archive),
+            (test_device_functions_cubin, cuda.Cubin),
+            (test_device_functions_cu, cuda.CUSource),
+            (test_device_functions_fatbin, cuda.Fatbin),
+            (test_device_functions_o, cuda.Object),
+            (test_device_functions_ptx, cuda.PTXSource),
+            (test_device_functions_ltoir, cuda.LTOIR),
+        ]
+
+        for path, kind in files_kind:
+            obj = LinkableCode.from_path_or_obj(path)
+            assert isinstance(obj, kind)
+
+            # Test that from_path_or_obj returns an existing object unchanged
+            obj2 = LinkableCode.from_path_or_obj(obj)
+            assert obj2 is obj
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
index 474b19710..5142c5397 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -1,16 +1,57 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import os
 from math import sqrt
-from numba import cuda, float32, int16, int32, int64, types, uint32, void
+
+
+from numba import (
+    cuda,
+    float32,
+    int16,
+    int32,
+    int64,
+    types,
+    uint32,
+    void,
+)
 from numba.cuda import (
     compile,
     compile_for_current_device,
     compile_ptx,
     compile_ptx_for_current_device,
+    compile_all,
+    LinkableCode,
 )
 from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
 
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_a = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.a"
+    )
+    test_device_functions_cubin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cubin"
+    )
+    test_device_functions_cu = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cu"
+    )
+    test_device_functions_fatbin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.fatbin"
+    )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
+    test_device_functions_o = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.o"
+    )
+    test_device_functions_ptx = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ptx"
+    )
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+
 
 # A test function at the module scope to ensure we get the name right for the C
 # ABI whether a function is at module or local scope.
@@ -20,14 +61,33 @@ def f_module(x, y):
 
 @skip_on_cudasim("Compilation unsupported in the simulator")
 class TestCompile(unittest.TestCase):
+    def _handle_compile_result(self, ret, compile_function):
+        ptx_or_code_list, resty = ret
+        if compile_function in (compile_ptx, compile):
+            ptx = ptx_or_code_list
+        else:
+            ptx = ptx_or_code_list[0]
+        return ptx, resty
+
     def test_global_kernel(self):
+        with self.subTest("compile_ptx"):
+            self._test_global_kernel(compile_ptx, {})
+
+        with self.subTest("compile_all"):
+            self._test_global_kernel(
+                compile_all, {"device": False, "abi": "numba", "output": "ptx"}
+            )
+
+    def _test_global_kernel(self, compile_function, default_kwargs):
        def f(r, x, y):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = x[i] + y[i]

        args = (float32[:], float32[:], float32[:])
-        ptx, resty = compile_ptx(f, args)
+
+        ret = compile_function(f, args, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)

        # Kernels should not have a func_retval parameter
        self.assertNotIn("func_retval", ptx)
@@ -39,11 +99,22 @@ def f(r, x, y):
         self.assertEqual(resty, void)
 
     def test_device_function(self):
+        with self.subTest("compile_ptx"):
+            self._test_device_function(compile_ptx, {"device": True})
+
+        with self.subTest("compile_all"):
+            self._test_device_function(
+                compile_all, {"device": True, "abi": "c", "output": "ptx"}
+            )
+
+    def _test_device_function(self, compile_function, default_kwargs):
         def add(x, y):
             return x + y
 
         args = (float32, float32)
-        ptx, resty = compile_ptx(add, args, device=True)
+
+        ret = compile_function(add, args, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # Device functions take a func_retval parameter for storing the
         # returned value in by reference
@@ -57,33 +128,44 @@ def add(x, y):
 
         # Check that function's output matches signature
         sig_int32 = int32(int32, int32)
-        ptx, resty = compile_ptx(add, sig_int32, device=True)
+        ret = compile_function(add, sig_int32, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertEqual(resty, int32)
 
         sig_int16 = int16(int16, int16)
-        ptx, resty = compile_ptx(add, sig_int16, device=True)
+        ret = compile_function(add, sig_int16, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertEqual(resty, int16)
 
         # Using string as signature
         sig_string = "uint32(uint32, uint32)"
-        ptx, resty = compile_ptx(add, sig_string, device=True)
+        ret = compile_function(add, sig_string, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertEqual(resty, uint32)
 
     def test_fastmath(self):
+        with self.subTest("compile_ptx"):
+            self._test_fastmath(compile_ptx, {"device": True})
+
+        with self.subTest("compile_all"):
+            self._test_fastmath(compile_all, {"device": True, "output": "ptx"})
+
+    def _test_fastmath(self, compile_function, default_kwargs):
         def f(x, y, z, d):
             return sqrt((x * y + z) / d)
 
         args = (float32, float32, float32, float32)
-        ptx, resty = compile_ptx(f, args, device=True)
 
         # Without fastmath, fma contraction is enabled by default, but ftz and
-        # approximate div / sqrt is not.
+        # approximate div / sqrt are not.
+        ret = compile_function(f, args, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertIn("fma.rn.f32", ptx)
         self.assertIn("div.rn.f32", ptx)
         self.assertIn("sqrt.rn.f32", ptx)
 
-        ptx, resty = compile_ptx(f, args, device=True, fastmath=True)
-        # With fastmath, ftz and approximate div / sqrt are enabled
+        # With fastmath, ftz and approximate div / sqrt are enabled
+        ret = compile_function(f, args, fastmath=True, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertIn("fma.rn.ftz.f32", ptx)
         self.assertIn("div.approx.ftz.f32", ptx)
         self.assertIn("sqrt.approx.ftz.f32", ptx)
@@ -104,18 +186,59 @@ def test_device_function_with_debug(self):
         # and NVVM assumed DBG version 1.0 if not specified, which is
         # incompatible with the 3.0 IR we use. This was specified only for
         # kernels.
+
+        with self.subTest("compile_ptx"):
+            self._test_device_function_with_debug(
+                compile_ptx, {"device": True, "debug": True, "opt": False}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_device_function_with_debug(
+                compile_all,
+                {
+                    "device": True,
+                    "debug": True,
+                    "opt": False,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_device_function_with_debug(
+        self, compile_function, default_kwargs
+    ):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), device=True, debug=True, opt=False)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_debug_info(ptx)
 
     def test_kernel_with_debug(self):
         # Inspired by (but not originally affected by) Issue #6719
+
+        with self.subTest("compile_ptx"):
+            self._test_kernel_with_debug(
+                compile_ptx, {"debug": True, "opt": False}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_kernel_with_debug(
+                compile_all,
+                {
+                    "device": False,
+                    "abi": "numba",
+                    "debug": True,
+                    "opt": False,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_kernel_with_debug(self, compile_function, default_kwargs):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), debug=True, opt=False)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_debug_info(ptx)
 
     def check_line_info(self, ptx):
@@ -125,52 +248,131 @@ def check_line_info(self, ptx):
         self.assertRegex(ptx, '\\.file.*test_compiler.py"')
 
     def test_device_function_with_line_info(self):
+        with self.subTest("compile_ptx"):
+            self._test_device_function_with_line_info(
+                compile_ptx, {"device": True, "lineinfo": True}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_device_function_with_line_info(
+                compile_all,
+                {
+                    "device": True,
+                    "abi": "numba",
+                    "lineinfo": True,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_device_function_with_line_info(
+        self, compile_function, default_kwargs
+    ):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), device=True, lineinfo=True)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_line_info(ptx)
 
     def test_kernel_with_line_info(self):
+        with self.subTest("compile_ptx"):
+            self._test_kernel_with_line_info(compile_ptx, {"lineinfo": True})
+
+        with self.subTest("compile_all"):
+            self._test_kernel_with_line_info(
+                compile_all,
+                {
+                    "device": False,
+                    "abi": "numba",
+                    "lineinfo": True,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_kernel_with_line_info(self, compile_function, default_kwargs):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), lineinfo=True)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_line_info(ptx)
 
     def test_non_void_return_type(self):
         def f(x, y):
             return x[0] + y[0]
 
-        with self.assertRaisesRegex(TypeError, "must have void return type"):
-            compile_ptx(f, (uint32[::1], uint32[::1]))
+        with self.subTest("compile_ptx"):
+            with self.assertRaisesRegex(
+                TypeError, "must have void return type"
+            ):
+                compile_ptx(f, (uint32[::1], uint32[::1]))
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(
+                TypeError, "must have void return type"
+            ):
+                compile_all(
+                    f,
+                    (uint32[::1], uint32[::1]),
+                    device=False,
+                    abi="numba",
+                    output="ptx",
+                )
 
     def test_c_abi_disallowed_for_kernel(self):
         def f(x, y):
             return x + y
 
-        with self.assertRaisesRegex(
-            NotImplementedError, "The C ABI is not supported for kernels"
-        ):
-            compile_ptx(f, (int32, int32), abi="c")
+        with self.subTest("compile_ptx"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "The C ABI is not supported for kernels"
+            ):
+                compile_ptx(f, (int32, int32), abi="c")
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "The C ABI is not supported for kernels"
+            ):
+                compile_all(
+                    f, (int32, int32), abi="c", device=False, output="ptx"
+                )
 
     def test_unsupported_abi(self):
         def f(x, y):
             return x + y
 
-        with self.assertRaisesRegex(
-            NotImplementedError, "Unsupported ABI: fastcall"
-        ):
-            compile_ptx(f, (int32, int32), abi="fastcall")
+        with self.subTest("compile_ptx"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "Unsupported ABI: fastcall"
+            ):
+                compile_ptx(f, (int32, int32), abi="fastcall")
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "Unsupported ABI: fastcall"
+            ):
+                compile_all(f, (int32, int32), abi="fastcall", output="ptx")
 
     def test_c_abi_device_function(self):
+        with self.subTest("compile_ptx"):
+            self._test_c_abi_device_function(
+                compile_ptx, {"device": True, "abi": "c"}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_c_abi_device_function(
+                compile_all, {"device": True, "abi": "c", "output": "ptx"}
+            )
+
+    def _test_c_abi_device_function(self, compile_function, default_kwargs):
         def f(x, y):
             return x + y
 
-        ptx, resty = compile_ptx(f, int32(int32, int32), device=True, abi="c")
+        # 32-bit signature
+        ret = compile_function(f, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # There should be no more than two parameters
         self.assertNotIn(ptx, "param_2")
-
         # The function name should match the Python function name (not the
         # qualname, which includes additional info), and its return value
         # should be 32 bits
@@ -180,15 +382,28 @@ def f(x, y):
             r"func_retval0\)\s+f\(",
         )
 
-        # If we compile for 64-bit integers, the return type should be 64 bits
-        # wide
-        ptx, resty = compile_ptx(f, int64(int64, int64), device=True, abi="c")
+        # A 64-bit signature should produce a 64-bit return parameter
+        ret = compile_function(f, int64(int64, int64), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64")
 
     def test_c_abi_device_function_module_scope(self):
-        ptx, resty = compile_ptx(
-            f_module, int32(int32, int32), device=True, abi="c"
-        )
+        with self.subTest("compile_ptx"):
+            self._test_c_abi_device_function_module_scope(
+                compile_ptx, {"device": True, "abi": "c"}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_c_abi_device_function_module_scope(
+                compile_all,
+                {"device": True, "abi": "c", "output": "ptx"},
+            )
+
+    def _test_c_abi_device_function_module_scope(
+        self, compile_function, default_kwargs
+    ):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # The function name should match the Python function name, and its
         # return value should be 32 bits
@@ -200,13 +415,27 @@ def test_c_abi_device_function_module_scope(self):
 
     def test_c_abi_with_abi_name(self):
         abi_info = {"abi_name": "_Z4funcii"}
-        ptx, resty = compile_ptx(
-            f_module,
-            int32(int32, int32),
-            device=True,
-            abi="c",
-            abi_info=abi_info,
-        )
+
+        with self.subTest("compile_ptx"):
+            self._test_c_abi_with_abi_name(
+                compile_ptx,
+                {"device": True, "abi": "c", "abi_info": abi_info},
+            )
+
+        with self.subTest("compile_all"):
+            self._test_c_abi_with_abi_name(
+                compile_all,
+                {
+                    "device": True,
+                    "abi": "c",
+                    "abi_info": abi_info,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_c_abi_with_abi_name(self, compile_function, default_kwargs):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # The function name should match the one given in the ABI info, and its
         # return value should be 32 bits
@@ -217,7 +446,18 @@ def test_c_abi_with_abi_name(self):
         )
 
     def test_compile_defaults_to_c_abi(self):
-        ptx, resty = compile(f_module, int32(int32, int32), device=True)
+        with self.subTest("compile"):
+            self._test_compile_defaults_to_c_abi(compile, {"device": True})
+
+        with self.subTest("compile_all"):
+            self._test_compile_defaults_to_c_abi(
+                compile_all,
+                {"device": True, "output": "ptx"},
+            )
+
+    def _test_compile_defaults_to_c_abi(self, compile_function, default_kwargs):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # The function name should match the Python function name, and its
         # return value should be 32 bits
@@ -228,28 +468,50 @@ def test_compile_defaults_to_c_abi(self):
         )
 
     def test_compile_to_ltoir(self):
-        ltoir, resty = compile(
-            f_module, int32(int32, int32), device=True, output="ltoir"
-        )
+        with self.subTest("compile"):
+            self._test_compile_to_ltoir(
+                compile, {"device": True, "output": "ltoir"}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_compile_to_ltoir(
+                compile_all,
+                {"device": True, "abi": "c", "output": "ltoir"},
+            )
+
+    def _test_compile_to_ltoir(self, compile_function, default_kwargs):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        code, resty = self._handle_compile_result(ret, compile_function)
 
         # There are no tools to interpret the LTOIR output, but we can check
         # that we appear to have obtained an LTOIR file. This magic number is
         # not documented, but is expected to remain consistent.
         LTOIR_MAGIC = 0x7F4E43ED
-        header = int.from_bytes(ltoir[:4], byteorder="little")
+        header = int.from_bytes(code[:4], byteorder="little")
         self.assertEqual(header, LTOIR_MAGIC)
         self.assertEqual(resty, int32)
 
     def test_compile_to_invalid_error(self):
         illegal_output = "illegal"
         msg = f"Unsupported output type: {illegal_output}"
-        with self.assertRaisesRegex(NotImplementedError, msg):
-            compile(
-                f_module,
-                int32(int32, int32),
-                device=True,
-                output=illegal_output,
-            )
+        with self.subTest("compile"):
+            with self.assertRaisesRegex(NotImplementedError, msg):
+                compile(
+                    f_module,
+                    int32(int32, int32),
+                    device=True,
+                    output=illegal_output,
+                )
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(NotImplementedError, msg):
+                compile_all(
+                    f_module,
+                    int32(int32, int32),
+                    device=True,
+                    abi="c",
+                    output=illegal_output,
+                )
 
     def test_functioncompiler_locals(self):
         # Tests against regression fixed in:
@@ -266,6 +528,89 @@ def f(b_arg):
             if cond:
                 b_smem[0] = b_arg[0]
 
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_compile_all_with_external_functions(self):
+        for link in [
+            test_device_functions_a,
+            test_device_functions_cubin,
+            test_device_functions_cu,
+            test_device_functions_fatbin,
+            test_device_functions_fatbin_multi,
+            test_device_functions_o,
+            test_device_functions_ptx,
+            test_device_functions_ltoir,
+        ]:
+            with self.subTest(link=link):
+                add = cuda.declare_device(
+                    "add_from_numba", "uint32(uint32, uint32)", link=[link]
+                )
+
+                def f(z, x, y):
+                    z[0] = add(x, y)
+
+                code_list, resty = compile_all(
+                    f, (uint32[::1], uint32, uint32), device=False, abi="numba"
+                )
+
+                assert resty == void
+                assert len(code_list) == 2
+                link_obj = LinkableCode.from_path(link)
+                if link_obj.kind == "cu":
+                    # A .cu file is compiled, so the result is object code
+                    if cuda.config.CUDA_USE_NVIDIA_BINDING:
+                        from cuda.core.experimental import ObjectCode
+
+                        assert isinstance(code_list[1], ObjectCode)
+                    else:
+                        assert isinstance(code_list[1], bytes)
+                else:
+                    assert code_list[1].kind == link_obj.kind
+
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_compile_all_lineinfo(self):
+        add = cuda.declare_device(
+            "add", "float32(float32, float32)", link=[test_device_functions_cu]
+        )
+
+        def f(z, x, y):
+            z[0] = add(x, y)
+
+        args = (float32[::1], float32, float32)
+        code_list, resty = compile_all(
+            f, args, lineinfo=True, output="ptx", device=False, abi="numba"
+        )
+        assert len(code_list) == 2
+
+        if cuda.config.CUDA_USE_NVIDIA_BINDING:
+            self.assertRegex(
+                str(code_list[1].code.decode()),
+                r"\.file.*test_device_functions",
+            )
+        else:
+            self.assertRegex(code_list[1], r"\.file.*test_device_functions")
+
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_compile_all_debug(self):
+        add = cuda.declare_device(
+            "add", "float32(float32, float32)", link=[test_device_functions_cu]
+        )
+
+        def f(z, x, y):
+            z[0] = add(x, y)
+
+        args = (float32[::1], float32, float32)
+        code_list, resty = compile_all(
+            f, args, debug=True, output="ptx", device=False, abi="numba"
+        )
+        assert len(code_list) == 2
+
+        if cuda.config.CUDA_USE_NVIDIA_BINDING:
+            self.assertRegex(
+                str(code_list[1].code.decode()), r"\.section\s+\.debug_info"
+            )
+        else:
+            self.assertRegex(code_list[1], r"\.section\s+\.debug_info")
+
 
 @skip_on_cudasim("Compilation unsupported in the simulator")
 class TestCompileForCurrentDevice(CUDATestCase):
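A closing sketch of the default LTO-IR path, using the (undocumented) magic number from the tests above; the device function is illustrative:

from numba import cuda, int32

def add(x, y):
    return x + y

# output defaults to "ltoir" for compile_all; codes[0] is the LTO-IR of the
# compiled function itself.
codes, resty = cuda.compile_all(add, int32(int32, int32), device=True)

LTOIR_MAGIC = 0x7F4E43ED
assert int.from_bytes(codes[0][:4], byteorder="little") == LTOIR_MAGIC
assert resty == int32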