diff --git a/docs/source/reference/host.rst b/docs/source/reference/host.rst
index 64bebd2d9..dbfd06d66 100644
--- a/docs/source/reference/host.rst
+++ b/docs/source/reference/host.rst
@@ -125,6 +125,8 @@ any of the driver API. This can be useful for:
 
 .. autofunction:: numba.cuda.compile
 
+.. autofunction:: numba.cuda.compile_all
+
 The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
 the default compute capability targeted by ``compile`` - see
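A minimal sketch (not part of the patch) of how the documented environment variable interacts with ``compile``. The value ``"7.5"`` and the explicit cc tuple are illustrative assumptions:

# Illustrative: the configured default CC applies only when no explicit cc
# is passed; an explicit cc always takes precedence.
import os

os.environ["NUMBA_CUDA_DEFAULT_PTX_CC"] = "7.5"  # set before importing

from numba import cuda, float32

def add(x, y):
    return x + y

# No explicit cc: targets max(configured default, lowest CC NVRTC supports).
ptx, resty = cuda.compile(add, (float32, float32), device=True)

# Explicit cc overrides the environment variable.
ptx_90, _ = cuda.compile(add, (float32, float32), device=True, cc=(9, 0))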
+ """ + + if output not in ("ptx", "ltoir"): + raise NotImplementedError(f"Unsupported output type: {output}") + + if forceinline and output != "ltoir": + raise ValueError("Can only designate forced inlining in LTO-IR") + + lto = output == "ltoir" + + cc = _default_cc(cc) + + lib, resty = _compile_pyfunc_with_fixup( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + forceinline=forceinline, + launch_bounds=launch_bounds, + lto=lto, + ) + + if lto: + code = lib.get_ltoir(cc=cc) + else: + code = lib.get_asm_str(cc=cc) + codes = [code] + + # linking_files + is_ltoir = output == "ltoir" + for path_or_obj in lib._linking_files: + obj = LinkableCode.from_path_or_obj(path_or_obj) + if obj.kind == "cu": + code, log = nvrtc.compile( + obj.data, + obj.name, + cc, + ltoir=is_ltoir, + lineinfo=lineinfo, + debug=debug, + ) + codes.append(code) + else: + codes.append(obj) + + return codes, resty + + +def _compile_pyfunc_with_fixup( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=True, + fastmath=False, + cc=None, + opt=None, + abi="c", + abi_info=None, + forceinline=False, + launch_bounds=None, + lto=False, +): + """Internal method to compile a python function and perform post-processing + + - If pyfunc is a kernel, post-processing includes kernel fixup and setting + launch bounds. + - If pyfunc is a device function, post-processing includes ABI wrapper. + + `lto` means that all internal pipeline options use LTO. + + Returns the code library and return type. + """ + if abi not in ("numba", "c"): + raise NotImplementedError(f"Unsupported ABI: {abi}") + + if abi == "c" and not device: + raise NotImplementedError("The C ABI is not supported for kernels") + + if forceinline and not device: + raise ValueError("Cannot force-inline kernels") + + debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug + opt = (config.OPT != 0) if opt is None else opt + + if debug and opt: + msg = ( + "debug=True with opt=True " + "is not supported by CUDA. This may result in a crash" + " - set debug=False or opt=False." 
diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index 40ff6ab00..bb6cd1c67 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -28,6 +28,7 @@
     funcdesc,
 )
 from numba.cuda.cudadrv import nvvm, nvrtc
+from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.flags import CUDAFlags
 from numba.cuda.target import CUDACABICallConv
@@ -977,6 +978,175 @@ def define_error_gv(postfix):
     return helper_func
 
 
+def compile_all(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    output="ltoir",
+    forceinline=False,
+    launch_bounds=None,
+):
+    """Similar to ``compile()``, but returns a list of PTX codes or LTO-IRs
+    for the compiled function and the external functions it depends on.
+    If an external function is CUDA C++ source, it is compiled with NVRTC.
+    Other kinds of external function code (e.g. cubins, fatbins, etc.) are
+    added directly to the returned list. The output code kind is determined
+    by the ``output`` parameter, which defaults to ``"ltoir"``.
+    """
+
+    if output not in ("ptx", "ltoir"):
+        raise NotImplementedError(f"Unsupported output type: {output}")
+
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
+
+    lto = output == "ltoir"
+
+    cc = _default_cc(cc)
+
+    lib, resty = _compile_pyfunc_with_fixup(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        forceinline=forceinline,
+        launch_bounds=launch_bounds,
+        lto=lto,
+    )
+
+    if lto:
+        code = lib.get_ltoir(cc=cc)
+    else:
+        code = lib.get_asm_str(cc=cc)
+    codes = [code]
+
+    # Compile or pass through the external functions that the compiled
+    # function links against.
+    for path_or_obj in lib._linking_files:
+        obj = LinkableCode.from_path_or_obj(path_or_obj)
+        if obj.kind == "cu":
+            code, log = nvrtc.compile(
+                obj.data,
+                obj.name,
+                cc,
+                ltoir=lto,
+                lineinfo=lineinfo,
+                debug=debug,
+            )
+            codes.append(code)
+        else:
+            codes.append(obj)
+
+    return codes, resty
+
+
+def _compile_pyfunc_with_fixup(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    forceinline=False,
+    launch_bounds=None,
+    lto=False,
+):
+    """Internal helper to compile a Python function and post-process it.
+
+    - If pyfunc is a kernel, post-processing performs the kernel fixup and
+      sets the launch bounds.
+    - If pyfunc is a device function, it is wrapped for the requested ABI.
+
+    If ``lto`` is True, the internal pipeline is configured to emit LTO-IR.
+
+    Returns the code library and the inferred return type.
+    """
+    if abi not in ("numba", "c"):
+        raise NotImplementedError(f"Unsupported ABI: {abi}")
+
+    if abi == "c" and not device:
+        raise NotImplementedError("The C ABI is not supported for kernels")
+
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+
+    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
+    opt = (config.OPT != 0) if opt is None else opt
+
+    if debug and opt:
+        msg = (
+            "debug=True with opt=True "
+            "is not supported by CUDA. This may result in a crash"
+            " - set debug=False or opt=False."
+        )
+        warn(NumbaInvalidConfigWarning(msg))
+
+    abi_info = abi_info or dict()
+
+    nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
+
+    if debug:
+        nvvm_options["g"] = None
+
+    if lto:
+        nvvm_options["gen-lto"] = None
+
+    args, return_type = sigutils.normalize_signature(sig)
+
+    cc = _default_cc(cc)
+
+    cres = compile_cuda(
+        pyfunc,
+        return_type,
+        args,
+        debug=debug,
+        lineinfo=lineinfo,
+        fastmath=fastmath,
+        nvvm_options=nvvm_options,
+        cc=cc,
+        forceinline=forceinline,
+    )
+    resty = cres.signature.return_type
+
+    if resty and not device and resty != types.void:
+        raise TypeError("CUDA kernel must have void return type.")
+
+    tgt = cres.target_context
+
+    if device:
+        lib = cres.library
+        if abi == "c":
+            wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
+            lib = cabi_wrap_function(
+                tgt, lib, cres.fndesc, wrapper_name, nvvm_options
+            )
+    else:
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, debug)
+        nvvm.set_launch_bounds(kernel, launch_bounds)
+
+    return lib, resty
+
+
 @global_compiler_lock
 def compile(
     pyfunc,
@@ -1050,82 +1220,30 @@ def compile(
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
-    if abi not in ("numba", "c"):
-        raise NotImplementedError(f"Unsupported ABI: {abi}")
-
-    if abi == "c" and not device:
-        raise NotImplementedError("The C ABI is not supported for kernels")
-
     if output not in ("ptx", "ltoir"):
         raise NotImplementedError(f"Unsupported output type: {output}")
 
-    if forceinline and not device:
-        raise ValueError("Cannot force-inline kernels")
-
     if forceinline and output != "ltoir":
         raise ValueError("Can only designate forced inlining in LTO-IR")
 
-    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
-    opt = (config.OPT != 0) if opt is None else opt
-
-    if debug and opt:
-        msg = (
-            "debug=True with opt=True "
-            "is not supported by CUDA. This may result in a crash"
-            " - set debug=False or opt=False."
-        )
-        warn(NumbaInvalidConfigWarning(msg))
-
     lto = output == "ltoir"
 
-    abi_info = abi_info or dict()
-
-    nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
-
-    if debug:
-        nvvm_options["g"] = None
-
-    if lto:
-        nvvm_options["gen-lto"] = None
-
-    args, return_type = sigutils.normalize_signature(sig)
-
-    # If the user has used the config variable to specify a non-default that is
-    # greater than the lowest non-deprecated one, then we should default to
-    # their specified CC instead of the lowest non-deprecated one.
-    MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
-    cc = cc or MIN_CC
-
-    cres = compile_cuda(
+    # Resolve the CC here as well, since it is used again below when
+    # generating the output code.
+    cc = _default_cc(cc)
+
+    lib, resty = _compile_pyfunc_with_fixup(
         pyfunc,
-        return_type,
-        args,
+        sig,
         debug=debug,
         lineinfo=lineinfo,
+        device=device,
         fastmath=fastmath,
-        nvvm_options=nvvm_options,
         cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
+        lto=lto,
     )
-    resty = cres.signature.return_type
-
-    if resty and not device and resty != types.void:
-        raise TypeError("CUDA kernel must have void return type.")
-
-    tgt = cres.target_context
-
-    if device:
-        lib = cres.library
-        if abi == "c":
-            wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
-            lib = cabi_wrap_function(
-                tgt, lib, cres.fndesc, wrapper_name, nvvm_options
-            )
-    else:
-        lib = cres.library
-        kernel = lib.get_function(cres.fndesc.llvm_func_name)
-        lib._entry_name = cres.fndesc.llvm_func_name
-        kernel_fixup(kernel, debug)
-        nvvm.set_launch_bounds(kernel, launch_bounds)
 
     if lto:
         code = lib.get_ltoir(cc=cc)
@@ -1272,3 +1392,16 @@ class ExternFunction:
     def __init__(self, name, sig):
         self.name = name
         self.sig = sig
+
+
+def _default_cc(cc):
+    """
+    Return the default compute capability.
+
+    If the user specifies a cc, return it. Otherwise return the greater of
+    the configured default (``NUMBA_CUDA_DEFAULT_PTX_CC``) and the lowest
+    compute capability supported by NVRTC.
+    """
+    if cc:
+        return cc
+    return max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
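A minimal usage sketch of the new API, mirroring the tests below (the external source path ``add.cu`` and the function name ``add_from_external`` are illustrative assumptions):

# compile_all returns one code object per translation unit: the
# Numba-compiled function first, then each external dependency.
from numba import cuda, uint32, void

# An external device function implemented in CUDA C++, linked by path.
add = cuda.declare_device(
    "add_from_external", "uint32(uint32, uint32)", link=["add.cu"]
)

def kernel(r, x, y):
    r[0] = add(x, y)

# output="ltoir" is the default; here we ask for PTX instead.
codes, resty = cuda.compile_all(
    kernel, (uint32[::1], uint32, uint32),
    device=False, abi="numba", output="ptx",
)
assert resty == void
assert len(codes) == 2  # the kernel's PTX, then the NVRTC-compiled source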
diff --git a/numba_cuda/numba/cuda/cudadrv/linkable_code.py b/numba_cuda/numba/cuda/cudadrv/linkable_code.py
index 106d84ed3..c2d8c0bda 100644
--- a/numba_cuda/numba/cuda/cudadrv/linkable_code.py
+++ b/numba_cuda/numba/cuda/cudadrv/linkable_code.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import io
+import os
+from typing import Union, Type
+
 from .mappings import FILE_EXTENSION_MAP
 
 
@@ -52,6 +55,66 @@ def data(self):
             return self._data.getvalue()
         return self._data
 
+    @staticmethod
+    def from_path(path: str):
+        """
+        Load a linkable code object from a file.
+
+        Parameters
+        ----------
+        path : str
+            The path to the file to load.
+
+        Returns
+        -------
+        LinkableCode
+            The linkable code object.
+
+        Raises
+        ------
+        ValueError
+            If the file extension is not supported.
+        """
+        root, extension = os.path.splitext(path)
+        basename = os.path.basename(root)
+        if extension in (".cu", ".ptx"):
+            mode = "r"
+        else:
+            mode = "rb"
+
+        with open(path, mode) as f:
+            data = f.read()
+
+        cls = _extension_to_linkable_code_kind(extension)
+        return cls(data, name=basename)
+
+    @classmethod
+    def from_path_or_obj(cls, path_or_obj: Union[str, "LinkableCode"]):
+        """
+        Load a linkable code object from a file path or an existing object.
+
+        If a path is provided, the file is loaded and a LinkableCode object
+        is returned. If a LinkableCode object is provided, it is returned
+        as-is.
+
+        Parameters
+        ----------
+        path_or_obj : str or LinkableCode
+            The path to the file or the LinkableCode object to load.
+
+        Returns
+        -------
+        LinkableCode
+            The linkable code object.
+
+        Raises
+        ------
+        ValueError
+            If the file extension is not supported.
+        """
+        if isinstance(path_or_obj, str):
+            return cls.from_path(path_or_obj)
+        return path_or_obj
+
 
 class PTXSource(LinkableCode):
     """PTX source code in memory."""
@@ -100,3 +163,22 @@ class LTOIR(LinkableCode):
 
     kind = FILE_EXTENSION_MAP["ltoir"]
     default_name = "<unnamed-ltoir>"
+
+
+def _extension_to_linkable_code_kind(extension: str) -> Type[LinkableCode]:
+    if extension == ".cu":
+        return CUSource
+    elif extension == ".ptx":
+        return PTXSource
+    elif extension == ".fatbin":
+        return Fatbin
+    elif extension == ".cubin":
+        return Cubin
+    elif extension == ".a":
+        return Archive
+    elif extension == ".o":
+        return Object
+    elif extension == ".ltoir":
+        return LTOIR
+    else:
+        raise ValueError(f"Unknown extension: {extension}")
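A sketch of the new loaders (the file names are illustrative). ``from_path`` picks the concrete subclass from the file extension; ``from_path_or_obj`` additionally accepts an existing object and returns it unchanged:

from numba.cuda.cudadrv.linkable_code import LinkableCode, CUSource

# .cu and .ptx files are read as text; all other kinds are read as bytes.
src = LinkableCode.from_path("helpers.cu")     # -> CUSource
obj = LinkableCode.from_path("helpers.cubin")  # -> Cubin

# An in-memory object passes through untouched.
in_memory = CUSource("__device__ int one() { return 1; }")
assert LinkableCode.from_path_or_obj(in_memory) is in_memory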
+ """ + if isinstance(path_or_obj, str): + return cls.from_path(path_or_obj) + return path_or_obj + class PTXSource(LinkableCode): """PTX source code in memory.""" @@ -100,3 +163,22 @@ class LTOIR(LinkableCode): kind = FILE_EXTENSION_MAP["ltoir"] default_name = "" + + +def _extension_to_linkable_code_kind(extension: str) -> Type[LinkableCode]: + if extension == ".cu": + return CUSource + elif extension == ".ptx": + return PTXSource + elif extension == ".fatbin": + return Fatbin + elif extension == ".cubin": + return Cubin + elif extension == ".a": + return Archive + elif extension == ".o": + return Object + elif extension == ".ltoir": + return LTOIR + else: + raise ValueError(f"Unknown extension: {extension}") diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 23803fc77..cef0b576c 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -289,7 +289,7 @@ def get_lto(self, program): return lto -def compile(src, name, cc, ltoir=False): +def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False): """ Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability. @@ -301,7 +301,11 @@ def compile(src, name, cc, ltoir=False): :type cc: tuple :param ltoir: Compile into LTOIR if True, otherwise into PTX :type ltoir: bool - :return: The compiled PTX and compilation log + :param lineinfo: Whether to include line information in the compiled code + :type lineinfo: bool + :param debug: Whether to include debug information in the compiled code + :type debug: bool + :return: The compiled PTX or LTOIR and compilation log :rtype: tuple """ @@ -384,6 +388,8 @@ def compile(src, name, cc, ltoir=False): relocatable_device_code=True, link_time_optimization=ltoir, name=name, + debug=debug, + lineinfo=lineinfo, ) class Logger: @@ -418,6 +424,10 @@ def write(self, msg): if ltoir: options.append("-dlto") + if lineinfo: + options.append("-lineinfo") + if debug: + options.append("-G") # Compile the program compile_error = nvrtc.compile_program(program, options) diff --git a/numba_cuda/numba/cuda/simulator/__init__.py b/numba_cuda/numba/cuda/simulator/__init__.py index da0d4b233..27cb64ed0 100644 --- a/numba_cuda/numba/cuda/simulator/__init__.py +++ b/numba_cuda/numba/cuda/simulator/__init__.py @@ -20,6 +20,7 @@ from .cudadrv.devices import require_context, gpus from .cudadrv.devices import get_context as current_context from .cudadrv.runtime import runtime +from .cudadrv.linkable_code import LinkableCode from numba.cuda.core import config reduce = Reduce diff --git a/numba_cuda/numba/cuda/simulator/compiler.py b/numba_cuda/numba/cuda/simulator/compiler.py index e11aeb2f0..1bf5a3af0 100644 --- a/numba_cuda/numba/cuda/simulator/compiler.py +++ b/numba_cuda/numba/cuda/simulator/compiler.py @@ -11,6 +11,7 @@ compile_ptx = None compile_ptx_for_current_device = None declare_device_function = None +compile_all = None def run_frontend(func, inline_closures=False, emit_dels=False): diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py new file mode 100644 index 000000000..39b07d710 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
diff --git a/numba_cuda/numba/cuda/simulator/__init__.py b/numba_cuda/numba/cuda/simulator/__init__.py
index da0d4b233..27cb64ed0 100644
--- a/numba_cuda/numba/cuda/simulator/__init__.py
+++ b/numba_cuda/numba/cuda/simulator/__init__.py
@@ -20,6 +20,7 @@
 from .cudadrv.devices import require_context, gpus
 from .cudadrv.devices import get_context as current_context
 from .cudadrv.runtime import runtime
+from .cudadrv.linkable_code import LinkableCode
 from numba.cuda.core import config
 
 reduce = Reduce
diff --git a/numba_cuda/numba/cuda/simulator/compiler.py b/numba_cuda/numba/cuda/simulator/compiler.py
index e11aeb2f0..1bf5a3af0 100644
--- a/numba_cuda/numba/cuda/simulator/compiler.py
+++ b/numba_cuda/numba/cuda/simulator/compiler.py
@@ -11,6 +11,7 @@
 compile_ptx = None
 compile_ptx_for_current_device = None
 declare_device_function = None
+compile_all = None
 
 
 def run_frontend(func, inline_closures=False, emit_dels=False):
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py
new file mode 100644
index 000000000..39b07d710
--- /dev/null
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import os
+from numba import cuda
+from numba.cuda.cudadrv.linkable_code import LinkableCode
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+import unittest
+
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_a = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.a"
+    )
+    test_device_functions_cubin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cubin"
+    )
+    test_device_functions_cu = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cu"
+    )
+    test_device_functions_fatbin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.fatbin"
+    )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
+    test_device_functions_o = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.o"
+    )
+    test_device_functions_ptx = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ptx"
+    )
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+
+
+class TestLinkableCode(CUDATestCase):
+    @skip_on_cudasim(reason="Simulator does not support linkable code")
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_linkable_code_from_path_or_obj(self):
+        files_kind = [
+            (test_device_functions_a, cuda.Archive),
+            (test_device_functions_cubin, cuda.Cubin),
+            (test_device_functions_cu, cuda.CUSource),
+            (test_device_functions_fatbin, cuda.Fatbin),
+            (test_device_functions_o, cuda.Object),
+            (test_device_functions_ptx, cuda.PTXSource),
+            (test_device_functions_ltoir, cuda.LTOIR),
+        ]
+
+        for path, kind in files_kind:
+            obj = LinkableCode.from_path_or_obj(path)
+            assert isinstance(obj, kind)
+
+            # Test that from_path_or_obj returns an existing object unchanged
+            obj2 = LinkableCode.from_path_or_obj(obj)
+            assert obj2 is obj
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
index 474b19710..5142c5397 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -1,16 +1,57 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import os
 from math import sqrt
-from numba import cuda, float32, int16, int32, int64, types, uint32, void
+
+
+from numba import (
+    cuda,
+    float32,
+    int16,
+    int32,
+    int64,
+    types,
+    uint32,
+    void,
+)
 from numba.cuda import (
     compile,
     compile_for_current_device,
     compile_ptx,
     compile_ptx_for_current_device,
+    compile_all,
+    LinkableCode,
 )
 from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
 
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_a = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.a"
+    )
+    test_device_functions_cubin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cubin"
+    )
+    test_device_functions_cu = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cu"
+    )
+    test_device_functions_fatbin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.fatbin"
+    )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
+    test_device_functions_o = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.o"
+    )
+    test_device_functions_ptx = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ptx"
+    )
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+
 
 # A test function at the module scope to ensure we get the name right for the C
 # ABI whether a function is at module or local scope.
@@ -20,14 +61,33 @@ def f_module(x, y):
 
 @skip_on_cudasim("Compilation unsupported in the simulator")
 class TestCompile(unittest.TestCase):
+    def _handle_compile_result(self, ret, compile_function):
+        ptx_or_code_list, resty = ret
+        if compile_function in (compile_ptx, compile):
+            ptx = ptx_or_code_list
+        else:
+            ptx = ptx_or_code_list[0]
+        return ptx, resty
+
     def test_global_kernel(self):
+        with self.subTest("compile_ptx"):
+            self._test_global_kernel(compile_ptx, {})
+
+        with self.subTest("compile_all"):
+            self._test_global_kernel(
+                compile_all, {"device": False, "abi": "numba", "output": "ptx"}
+            )
+
+    def _test_global_kernel(self, compile_function, default_kwargs):
        def f(r, x, y):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = x[i] + y[i]

        args = (float32[:], float32[:], float32[:])
-        ptx, resty = compile_ptx(f, args)
+
+        ret = compile_function(f, args, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)

        # Kernels should not have a func_retval parameter
        self.assertNotIn("func_retval", ptx)
@@ -39,11 +99,22 @@ def f(r, x, y):
         self.assertEqual(resty, void)
 
     def test_device_function(self):
+        with self.subTest("compile_ptx"):
+            self._test_device_function(compile_ptx, {"device": True})
+
+        with self.subTest("compile_all"):
+            self._test_device_function(
+                compile_all, {"device": True, "abi": "c", "output": "ptx"}
+            )
+
+    def _test_device_function(self, compile_function, default_kwargs):
         def add(x, y):
             return x + y
 
         args = (float32, float32)
-        ptx, resty = compile_ptx(add, args, device=True)
+
+        ret = compile_function(add, args, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # Device functions take a func_retval parameter for storing the
         # returned value in by reference
@@ -57,33 +128,44 @@ def add(x, y):
 
         # Check that function's output matches signature
         sig_int32 = int32(int32, int32)
-        ptx, resty = compile_ptx(add, sig_int32, device=True)
+        ret = compile_function(add, sig_int32, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertEqual(resty, int32)
 
         sig_int16 = int16(int16, int16)
-        ptx, resty = compile_ptx(add, sig_int16, device=True)
+        ret = compile_function(add, sig_int16, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertEqual(resty, int16)
 
         # Using string as signature
         sig_string = "uint32(uint32, uint32)"
-        ptx, resty = compile_ptx(add, sig_string, device=True)
+        ret = compile_function(add, sig_string, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertEqual(resty, uint32)
 
     def test_fastmath(self):
+        with self.subTest("compile_ptx"):
+            self._test_fastmath(compile_ptx, {"device": True})
+
+        with self.subTest("compile_all"):
+            self._test_fastmath(compile_all, {"device": True, "output": "ptx"})
+
+    def _test_fastmath(self, compile_function, default_kwargs):
         def f(x, y, z, d):
             return sqrt((x * y + z) / d)
 
         args = (float32, float32, float32, float32)
-        ptx, resty = compile_ptx(f, args, device=True)
 
         # Without fastmath, fma contraction is enabled by default, but ftz and
-        # approximate div / sqrt is not.
+        # approximate div / sqrt are not.
+        ret = compile_function(f, args, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertIn("fma.rn.f32", ptx)
         self.assertIn("div.rn.f32", ptx)
         self.assertIn("sqrt.rn.f32", ptx)
 
-        ptx, resty = compile_ptx(f, args, device=True, fastmath=True)
-        # With fastmath, ftz and approximate div / sqrt are enabled
+        # With fastmath, ftz and approximate div / sqrt are enabled
+        ret = compile_function(f, args, fastmath=True, **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertIn("fma.rn.ftz.f32", ptx)
         self.assertIn("div.approx.ftz.f32", ptx)
         self.assertIn("sqrt.approx.ftz.f32", ptx)
@@ -104,18 +186,59 @@ def test_device_function_with_debug(self):
         # and NVVM assumed DBG version 1.0 if not specified, which is
         # incompatible with the 3.0 IR we use. This was specified only for
         # kernels.
+
+        with self.subTest("compile_ptx"):
+            self._test_device_function_with_debug(
+                compile_ptx, {"device": True, "debug": True, "opt": False}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_device_function_with_debug(
+                compile_all,
+                {
+                    "device": True,
+                    "debug": True,
+                    "opt": False,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_device_function_with_debug(
+        self, compile_function, default_kwargs
+    ):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), device=True, debug=True, opt=False)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_debug_info(ptx)
 
     def test_kernel_with_debug(self):
         # Inspired by (but not originally affected by) Issue #6719
+
+        with self.subTest("compile_ptx"):
+            self._test_kernel_with_debug(
+                compile_ptx, {"debug": True, "opt": False}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_kernel_with_debug(
+                compile_all,
+                {
+                    "device": False,
+                    "abi": "numba",
+                    "debug": True,
+                    "opt": False,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_kernel_with_debug(self, compile_function, default_kwargs):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), debug=True, opt=False)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_debug_info(ptx)
 
     def check_line_info(self, ptx):
@@ -125,52 +248,131 @@ def check_line_info(self, ptx):
         self.assertRegex(ptx, '\\.file.*test_compiler.py"')
 
     def test_device_function_with_line_info(self):
+        with self.subTest("compile_ptx"):
+            self._test_device_function_with_line_info(
+                compile_ptx, {"device": True, "lineinfo": True}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_device_function_with_line_info(
+                compile_all,
+                {
+                    "device": True,
+                    "abi": "numba",
+                    "lineinfo": True,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_device_function_with_line_info(
+        self, compile_function, default_kwargs
+    ):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), device=True, lineinfo=True)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_line_info(ptx)
 
     def test_kernel_with_line_info(self):
+        with self.subTest("compile_ptx"):
+            self._test_kernel_with_line_info(compile_ptx, {"lineinfo": True})
+
+        with self.subTest("compile_all"):
+            self._test_kernel_with_line_info(
+                compile_all,
+                {
+                    "device": False,
+                    "abi": "numba",
+                    "lineinfo": True,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_kernel_with_line_info(self, compile_function, default_kwargs):
         def f():
             pass
 
-        ptx, resty = compile_ptx(f, (), lineinfo=True)
+        ret = compile_function(f, (), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.check_line_info(ptx)
 
     def test_non_void_return_type(self):
         def f(x, y):
             return x[0] + y[0]
 
-        with self.assertRaisesRegex(TypeError, "must have void return type"):
-            compile_ptx(f, (uint32[::1], uint32[::1]))
+        with self.subTest("compile_ptx"):
+            with self.assertRaisesRegex(
+                TypeError, "must have void return type"
+            ):
+                compile_ptx(f, (uint32[::1], uint32[::1]))
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(
+                TypeError, "must have void return type"
+            ):
+                compile_all(
+                    f,
+                    (uint32[::1], uint32[::1]),
+                    device=False,
+                    abi="numba",
+                    output="ptx",
+                )
 
     def test_c_abi_disallowed_for_kernel(self):
         def f(x, y):
             return x + y
 
-        with self.assertRaisesRegex(
-            NotImplementedError, "The C ABI is not supported for kernels"
-        ):
-            compile_ptx(f, (int32, int32), abi="c")
+        with self.subTest("compile_ptx"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "The C ABI is not supported for kernels"
+            ):
+                compile_ptx(f, (int32, int32), abi="c")
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "The C ABI is not supported for kernels"
+            ):
+                compile_all(
+                    f, (int32, int32), abi="c", device=False, output="ptx"
+                )
 
     def test_unsupported_abi(self):
         def f(x, y):
             return x + y
 
-        with self.assertRaisesRegex(
-            NotImplementedError, "Unsupported ABI: fastcall"
-        ):
-            compile_ptx(f, (int32, int32), abi="fastcall")
+        with self.subTest("compile_ptx"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "Unsupported ABI: fastcall"
+            ):
+                compile_ptx(f, (int32, int32), abi="fastcall")
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(
+                NotImplementedError, "Unsupported ABI: fastcall"
+            ):
+                compile_all(f, (int32, int32), abi="fastcall", output="ptx")
 
     def test_c_abi_device_function(self):
+        with self.subTest("compile_ptx"):
+            self._test_c_abi_device_function(
+                compile_ptx, {"device": True, "abi": "c"}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_c_abi_device_function(
+                compile_all, {"device": True, "abi": "c", "output": "ptx"}
+            )
+
+    def _test_c_abi_device_function(self, compile_function, default_kwargs):
         def f(x, y):
             return x + y
 
-        ptx, resty = compile_ptx(f, int32(int32, int32), device=True, abi="c")
+        # 32-bit signature
+        ret = compile_function(f, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # There should be no more than two parameters
         self.assertNotIn(ptx, "param_2")
-
         # The function name should match the Python function name (not the
         # qualname, which includes additional info), and its return value
         # should be 32 bits
@@ -180,15 +382,28 @@ def f(x, y):
             r"func_retval0\)\s+f\(",
         )
 
-        # If we compile for 64-bit integers, the return type should be 64 bits
-        # wide
-        ptx, resty = compile_ptx(f, int64(int64, int64), device=True, abi="c")
+        # A 64-bit signature should produce a 64-bit return parameter
+        ret = compile_function(f, int64(int64, int64), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
         self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64")
 
     def test_c_abi_device_function_module_scope(self):
-        ptx, resty = compile_ptx(
-            f_module, int32(int32, int32), device=True, abi="c"
-        )
+        with self.subTest("compile_ptx"):
+            self._test_c_abi_device_function_module_scope(
+                compile_ptx, {"device": True, "abi": "c"}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_c_abi_device_function_module_scope(
+                compile_all,
+                {"device": True, "abi": "c", "output": "ptx"},
+            )
+
+    def _test_c_abi_device_function_module_scope(
+        self, compile_function, default_kwargs
+    ):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # The function name should match the Python function name, and its
         # return value should be 32 bits
@@ -200,13 +415,27 @@ def test_c_abi_device_function_module_scope(self):
 
     def test_c_abi_with_abi_name(self):
         abi_info = {"abi_name": "_Z4funcii"}
-        ptx, resty = compile_ptx(
-            f_module,
-            int32(int32, int32),
-            device=True,
-            abi="c",
-            abi_info=abi_info,
-        )
+
+        with self.subTest("compile_ptx"):
+            self._test_c_abi_with_abi_name(
+                compile_ptx,
+                {"device": True, "abi": "c", "abi_info": abi_info},
+            )
+
+        with self.subTest("compile_all"):
+            self._test_c_abi_with_abi_name(
+                compile_all,
+                {
+                    "device": True,
+                    "abi": "c",
+                    "abi_info": abi_info,
+                    "output": "ptx",
+                },
+            )
+
+    def _test_c_abi_with_abi_name(self, compile_function, default_kwargs):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # The function name should match the one given in the ABI info, and its
         # return value should be 32 bits
@@ -217,7 +446,18 @@ def test_c_abi_with_abi_name(self):
         )
 
     def test_compile_defaults_to_c_abi(self):
-        ptx, resty = compile(f_module, int32(int32, int32), device=True)
+        with self.subTest("compile"):
+            self._test_compile_defaults_to_c_abi(compile, {"device": True})
+
+        with self.subTest("compile_all"):
+            self._test_compile_defaults_to_c_abi(
+                compile_all,
+                {"device": True, "output": "ptx"},
+            )
+
+    def _test_compile_defaults_to_c_abi(self, compile_function, default_kwargs):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        ptx, resty = self._handle_compile_result(ret, compile_function)
 
         # The function name should match the Python function name, and its
         # return value should be 32 bits
@@ -228,28 +468,50 @@ def test_compile_defaults_to_c_abi(self):
         )
 
     def test_compile_to_ltoir(self):
-        ltoir, resty = compile(
-            f_module, int32(int32, int32), device=True, output="ltoir"
-        )
+        with self.subTest("compile"):
+            self._test_compile_to_ltoir(
+                compile, {"device": True, "output": "ltoir"}
+            )
+
+        with self.subTest("compile_all"):
+            self._test_compile_to_ltoir(
+                compile_all,
+                {"device": True, "abi": "c", "output": "ltoir"},
+            )
+
+    def _test_compile_to_ltoir(self, compile_function, default_kwargs):
+        ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
+        code, resty = self._handle_compile_result(ret, compile_function)
 
         # There are no tools to interpret the LTOIR output, but we can check
         # that we appear to have obtained an LTOIR file. This magic number is
         # not documented, but is expected to remain consistent.
         LTOIR_MAGIC = 0x7F4E43ED
-        header = int.from_bytes(ltoir[:4], byteorder="little")
+        header = int.from_bytes(code[:4], byteorder="little")
         self.assertEqual(header, LTOIR_MAGIC)
         self.assertEqual(resty, int32)
 
     def test_compile_to_invalid_error(self):
         illegal_output = "illegal"
         msg = f"Unsupported output type: {illegal_output}"
-        with self.assertRaisesRegex(NotImplementedError, msg):
-            compile(
-                f_module,
-                int32(int32, int32),
-                device=True,
-                output=illegal_output,
-            )
+        with self.subTest("compile"):
+            with self.assertRaisesRegex(NotImplementedError, msg):
+                compile(
+                    f_module,
+                    int32(int32, int32),
+                    device=True,
+                    output=illegal_output,
+                )
+
+        with self.subTest("compile_all"):
+            with self.assertRaisesRegex(NotImplementedError, msg):
+                compile_all(
+                    f_module,
+                    int32(int32, int32),
+                    device=True,
+                    abi="c",
+                    output=illegal_output,
+                )
 
     def test_functioncompiler_locals(self):
         # Tests against regression fixed in:
@@ -266,6 +528,89 @@ def f(b_arg):
             if cond:
                 b_smem[0] = b_arg[0]
 
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_compile_all_with_external_functions(self):
+        for link in [
+            test_device_functions_a,
+            test_device_functions_cubin,
+            test_device_functions_cu,
+            test_device_functions_fatbin,
+            test_device_functions_fatbin_multi,
+            test_device_functions_o,
+            test_device_functions_ptx,
+            test_device_functions_ltoir,
+        ]:
+            with self.subTest(link=link):
+                add = cuda.declare_device(
+                    "add_from_numba", "uint32(uint32, uint32)", link=[link]
+                )
+
+                def f(z, x, y):
+                    z[0] = add(x, y)
+
+                code_list, resty = compile_all(
+                    f, (uint32[::1], uint32, uint32), device=False, abi="numba"
+                )
+
+                assert resty == void
+                assert len(code_list) == 2
+                link_obj = LinkableCode.from_path(link)
+                if link_obj.kind == "cu":
+                    # A .cu file is compiled, so the result is object code
+                    if cuda.config.CUDA_USE_NVIDIA_BINDING:
+                        from cuda.core.experimental import ObjectCode
+
+                        assert isinstance(code_list[1], ObjectCode)
+                    else:
+                        assert isinstance(code_list[1], bytes)
+                else:
+                    assert code_list[1].kind == link_obj.kind
+
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_compile_all_lineinfo(self):
+        add = cuda.declare_device(
+            "add", "float32(float32, float32)", link=[test_device_functions_cu]
+        )
+
+        def f(z, x, y):
+            z[0] = add(x, y)
+
+        args = (float32[::1], float32, float32)
+        code_list, resty = compile_all(
+            f, args, lineinfo=True, output="ptx", device=False, abi="numba"
+        )
+        assert len(code_list) == 2
+
+        if cuda.config.CUDA_USE_NVIDIA_BINDING:
+            self.assertRegex(
+                str(code_list[1].code.decode()),
+                r"\.file.*test_device_functions",
+            )
+        else:
+            self.assertRegex(code_list[1], r"\.file.*test_device_functions")
+
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_compile_all_debug(self):
+        add = cuda.declare_device(
+            "add", "float32(float32, float32)", link=[test_device_functions_cu]
+        )
+
+        def f(z, x, y):
+            z[0] = add(x, y)
+
+        args = (float32[::1], float32, float32)
+        code_list, resty = compile_all(
+            f, args, debug=True, output="ptx", device=False, abi="numba"
+        )
+        assert len(code_list) == 2
+
+        if cuda.config.CUDA_USE_NVIDIA_BINDING:
+            self.assertRegex(
+                str(code_list[1].code.decode()), r"\.section\s+\.debug_info"
+            )
+        else:
+            self.assertRegex(code_list[1], r"\.section\s+\.debug_info")
+
 
 @skip_on_cudasim("Compilation unsupported in the simulator")
 class TestCompileForCurrentDevice(CUDATestCase):
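A closing sketch of the default LTO-IR path, using the (undocumented) magic number from the tests above; the device function is illustrative:

from numba import cuda, int32

def add(x, y):
    return x + y

# output defaults to "ltoir" for compile_all; codes[0] is the LTO-IR of the
# compiled function itself.
codes, resty = cuda.compile_all(add, int32(int32, int32), device=True)

LTOIR_MAGIC = 0x7F4E43ED
assert int.from_bytes(codes[0][:4], byteorder="little") == LTOIR_MAGIC
assert resty == int32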