Merged
Changes from all commits
Commits
28 commits
4d82e84
checkpointing
isVoid Sep 4, 2025
a126b51
Merge branch 'main' of github.com:NVIDIA/numba-cuda into fea-compile-all
isVoid Sep 4, 2025
fc21ef0
another checkpointing
isVoid Sep 4, 2025
dbc6c38
add from_path in linkable code test
isVoid Sep 9, 2025
97712f5
initial pass on implementing compile_all and tests
isVoid Sep 19, 2025
79d2ab7
add resty assertion
isVoid Sep 19, 2025
2cf6b6c
align compile_all arguments to compile_ptx
isVoid Sep 19, 2025
a318b5d
add lineinfo and debug test
isVoid Sep 20, 2025
0c14dde
fix test case with ctypes binding
isVoid Sep 23, 2025
42cdccf
align arguments with `compile`
isVoid Sep 23, 2025
f778db9
Merge branch 'main' into fea-compile-all
isVoid Sep 23, 2025
edae868
conditionally import cuda.core in tests
isVoid Sep 23, 2025
48548e3
skip linkable code
isVoid Sep 23, 2025
792e8d7
skipping tests that requires pregenerated binaries
isVoid Sep 26, 2025
3813c0a
rename lto var name
isVoid Sep 26, 2025
67ec2fa
add linkable code API docs
isVoid Sep 26, 2025
77f0e85
Merge branch 'main' into fea-compile-all
isVoid Sep 26, 2025
e69affe
update to cuda config registry
isVoid Sep 26, 2025
5408f2f
add compile_all to simulator
isVoid Sep 29, 2025
b1e51bf
expose linkable code in simulator
isVoid Sep 29, 2025
bf4a5f2
Merge branch 'main' into fea-compile-all
isVoid Sep 30, 2025
afa8959
docstring update, remove unused code
isVoid Oct 2, 2025
851f39b
update test_device_function to reduce redundancy
isVoid Oct 2, 2025
90287fe
update test_fastmath and test_global_kernel
isVoid Oct 2, 2025
4b31dbc
update test_kernel/device_function_with_debug
isVoid Oct 2, 2025
4a4baf6
update lineinfo related tests
isVoid Oct 2, 2025
1a04d0b
update cabi, ltoir tests
isVoid Oct 2, 2025
9650d94
Merge branch 'main' into fea-compile-all
gmarkall Oct 2, 2025
2 changes: 2 additions & 0 deletions docs/source/reference/host.rst
@@ -125,6 +125,8 @@ any of the driver API. This can be useful for:

.. autofunction:: numba.cuda.compile

.. autofunction:: numba.cuda.compile_all


The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
the default compute capability targeted by ``compile`` - see
1 change: 1 addition & 0 deletions numba_cuda/numba/cuda/__init__.py
@@ -78,6 +78,7 @@
compile_for_current_device,
compile_ptx,
compile_ptx_for_current_device,
compile_all,
)

# This is the out-of-tree NVIDIA-maintained target. This is reported in Numba
3 changes: 3 additions & 0 deletions numba_cuda/numba/cuda/codegen.py
@@ -471,6 +471,7 @@ def _reduce_states(self):
needs_cudadevrt=self.needs_cudadevrt,
nrt=nrt,
use_cooperative=self.use_cooperative,
lto=self._lto,
)

@classmethod
@@ -488,6 +489,7 @@ def _rebuild(
needs_cudadevrt,
nrt,
use_cooperative,
lto,
):
"""
Rebuild an instance.
@@ -508,6 +510,7 @@
if nrt:
instance._linking_files = {NRT_LIBRARY}

instance._lto = lto
return instance


251 changes: 189 additions & 62 deletions numba_cuda/numba/cuda/compiler.py
@@ -28,6 +28,7 @@
funcdesc,
)
from numba.cuda.cudadrv import nvvm, nvrtc
from numba.cuda.cudadrv.linkable_code import LinkableCode
from numba.cuda.descriptor import cuda_target
from numba.cuda.flags import CUDAFlags
from numba.cuda.target import CUDACABICallConv
@@ -977,6 +978,175 @@ def define_error_gv(postfix):
return helper_func


def compile_all(
pyfunc,
sig,
debug=None,
lineinfo=False,
device=True,
fastmath=False,
cc=None,
opt=None,
abi="c",
abi_info=None,
output="ltoir",
forceinline=False,
launch_bounds=None,
):
"""Similar to ``compile()``, but returns a list of PTX codes/LTO-IRs for
the compiled function and the external functions it depends on.
If external functions are CUDA C++ source, they will be compiled with
NVRTC. Other kinds of external function code (e.g. cubins, fatbins, etc.)
will be added directly to the return list. The output code kind is
determined by the ``output`` parameter, and defaults to ``"ltoir"``.
"""

if output not in ("ptx", "ltoir"):
raise NotImplementedError(f"Unsupported output type: {output}")

if forceinline and output != "ltoir":
raise ValueError("Can only designate forced inlining in LTO-IR")
Contributor

Does this need to be a hard error? Is forceinline a guarantee or can the compiler still ignore it?

Contributor Author

I imagine this is because there is no way to carry the inlining information when you specify the output as PTX. The information can only persist via LTO-IR.

Contributor

That's correct - you can't inline PTX. This looks like new code on the diff, but this PR is only really moving it from the original compile() implementation.
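As an illustration of this constraint, a minimal hedged sketch (assuming the compile_all added in this PR is importable as numba.cuda.compile_all and a CUDA toolkit is available; the example function is hypothetical):

```python
# Hedged sketch of the forceinline/output interaction discussed above.
from numba import cuda, float32

def axpy(a, x, y):
    return a * x + y

sig = (float32, float32, float32)

# LTO-IR output: forceinline is accepted; the inlining hint survives to link time.
codes, resty = cuda.compile_all(axpy, sig, output="ltoir", forceinline=True)

# PTX output: forceinline raises ValueError, since PTX cannot carry the hint.
try:
    cuda.compile_all(axpy, sig, output="ptx", forceinline=True)
except ValueError as e:
    print(e)  # "Can only designate forced inlining in LTO-IR"
```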


lto = output == "ltoir"

cc = _default_cc(cc)

lib, resty = _compile_pyfunc_with_fixup(
pyfunc,
sig,
debug=debug,
lineinfo=lineinfo,
device=device,
fastmath=fastmath,
cc=cc,
opt=opt,
abi=abi,
abi_info=abi_info,
forceinline=forceinline,
launch_bounds=launch_bounds,
lto=lto,
)

if lto:
code = lib.get_ltoir(cc=cc)
else:
code = lib.get_asm_str(cc=cc)
codes = [code]

# linking_files
is_ltoir = output == "ltoir"
for path_or_obj in lib._linking_files:
obj = LinkableCode.from_path_or_obj(path_or_obj)
Contributor
@rparolin Sep 24, 2025

The variable is called path_or_obj but the code appears to handle cu and obj use cases. Should the variable be renamed, or are you missing the path handling use case?

Contributor Author
@isVoid Sep 26, 2025

The objects in lib._linking_files can be arbitrary linkable code objects or paths, of which only CUDA source files require compilation before they can be fed to the linker. (That is the implied contract for compile_all's return values: everything the API returns should be passable to a linker without additional processing.) Therefore we special-case them below and compile them with NVRTC before returning them to the user.

if obj.kind == "cu":
code, log = nvrtc.compile(
obj.data,
obj.name,
cc,
ltoir=is_ltoir,
lineinfo=lineinfo,
debug=debug,
)
codes.append(code)
else:
codes.append(obj)

return codes, resty
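Relating to the review discussion above about lib._linking_files, a hedged sketch of the intended end-to-end flow (the file name mul.cu, its mul_f32 function, and the link argument to declare_device are illustrative assumptions, not part of this diff):

```python
# Hedged sketch: assumes cuda.declare_device accepts a `link` argument and that
# a hypothetical "mul.cu" defines an extern "C" __device__ function `mul_f32`.
from numba import cuda, float32

mul = cuda.declare_device("mul_f32", float32(float32, float32), link=["mul.cu"])

def scale(x, y):
    return mul(x, y)

codes, resty = cuda.compile_all(scale, (float32, float32), output="ltoir")

# codes[0] holds the LTO-IR for `scale`; the remaining entries are its external
# dependencies - here, "mul.cu" compiled to LTO-IR by NVRTC. Per the comment
# above, every entry should be consumable by a linker without further processing.
```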


def _compile_pyfunc_with_fixup(
pyfunc,
sig,
debug=None,
lineinfo=False,
device=True,
fastmath=False,
cc=None,
opt=None,
abi="c",
abi_info=None,
forceinline=False,
launch_bounds=None,
lto=False,
):
"""Internal method to compile a python function and perform post-processing

- If pyfunc is a kernel, post-processing includes kernel fixup and setting
launch bounds.
- If pyfunc is a device function, post-processing includes ABI wrapper.

`lto` means that all internal pipeline options use LTO.

Returns the code library and return type.
"""
if abi not in ("numba", "c"):
raise NotImplementedError(f"Unsupported ABI: {abi}")

if abi == "c" and not device:
raise NotImplementedError("The C ABI is not supported for kernels")

if forceinline and not device:
raise ValueError("Cannot force-inline kernels")

debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
opt = (config.OPT != 0) if opt is None else opt

if debug and opt:
msg = (
"debug=True with opt=True "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False."
)
warn(NumbaInvalidConfigWarning(msg))

abi_info = abi_info or dict()

nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}

if debug:
nvvm_options["g"] = None

if lto:
nvvm_options["gen-lto"] = None

args, return_type = sigutils.normalize_signature(sig)

cc = _default_cc(cc)

cres = compile_cuda(
pyfunc,
return_type,
args,
debug=debug,
lineinfo=lineinfo,
fastmath=fastmath,
nvvm_options=nvvm_options,
cc=cc,
forceinline=forceinline,
)
resty = cres.signature.return_type

if resty and not device and resty != types.void:
raise TypeError("CUDA kernel must have void return type.")

tgt = cres.target_context

if device:
lib = cres.library
if abi == "c":
wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
lib = cabi_wrap_function(
tgt, lib, cres.fndesc, wrapper_name, nvvm_options
)
else:
lib = cres.library
kernel = lib.get_function(cres.fndesc.llvm_func_name)
lib._entry_name = cres.fndesc.llvm_func_name
kernel_fixup(kernel, debug)
nvvm.set_launch_bounds(kernel, launch_bounds)

return lib, resty


@global_compiler_lock
def compile(
pyfunc,
@@ -1050,82 +1220,28 @@ def compile(
:return: (code, resty): The compiled code and inferred return type
:rtype: tuple
"""
if abi not in ("numba", "c"):
raise NotImplementedError(f"Unsupported ABI: {abi}")

if abi == "c" and not device:
raise NotImplementedError("The C ABI is not supported for kernels")

if output not in ("ptx", "ltoir"):
raise NotImplementedError(f"Unsupported output type: {output}")

if forceinline and not device:
raise ValueError("Cannot force-inline kernels")

if forceinline and output != "ltoir":
raise ValueError("Can only designate forced inlining in LTO-IR")

debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
opt = (config.OPT != 0) if opt is None else opt

if debug and opt:
msg = (
"debug=True with opt=True "
"is not supported by CUDA. This may result in a crash"
" - set debug=False or opt=False."
)
warn(NumbaInvalidConfigWarning(msg))

lto = output == "ltoir"
abi_info = abi_info or dict()

nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}

if debug:
nvvm_options["g"] = None

if lto:
nvvm_options["gen-lto"] = None

args, return_type = sigutils.normalize_signature(sig)

# If the user has used the config variable to specify a non-default that is
# greater than the lowest non-deprecated one, then we should default to
# their specified CC instead of the lowest non-deprecated one.
MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
cc = cc or MIN_CC

cres = compile_cuda(
lib, resty = _compile_pyfunc_with_fixup(
pyfunc,
return_type,
args,
sig,
debug=debug,
lineinfo=lineinfo,
device=device,
fastmath=fastmath,
nvvm_options=nvvm_options,
cc=cc,
opt=opt,
abi=abi,
abi_info=abi_info,
forceinline=forceinline,
launch_bounds=launch_bounds,
lto=lto,
)
resty = cres.signature.return_type

if resty and not device and resty != types.void:
raise TypeError("CUDA kernel must have void return type.")

tgt = cres.target_context

if device:
lib = cres.library
if abi == "c":
wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
lib = cabi_wrap_function(
tgt, lib, cres.fndesc, wrapper_name, nvvm_options
)
else:
lib = cres.library
kernel = lib.get_function(cres.fndesc.llvm_func_name)
lib._entry_name = cres.fndesc.llvm_func_name
kernel_fixup(kernel, debug)
nvvm.set_launch_bounds(kernel, launch_bounds)

if lto:
code = lib.get_ltoir(cc=cc)
@@ -1272,3 +1388,14 @@ class ExternFunction:
def __init__(self, name, sig):
self.name = name
self.sig = sig


def _default_cc(cc):
"""
Return the compute capability to target.

If the user specifies ``cc``, return it unchanged; otherwise return the
larger of ``config.CUDA_DEFAULT_PTX_CC`` and the lowest compute capability
supported by NVRTC.
"""
if cc:
return cc
return max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())
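To make the fallback in _default_cc concrete, a small standalone sketch (the numeric compute capabilities are illustrative assumptions, not values taken from this PR):

```python
# Illustrative model of _default_cc's selection logic; the concrete
# (major, minor) values below are assumptions for the example only.
def pick_cc(cc, configured_cc=(7, 5), lowest_supported_cc=(7, 0)):
    # An explicit compute capability from the caller always wins.
    if cc:
        return cc
    # Otherwise use the higher of the configured default and the lowest
    # compute capability NVRTC still supports (tuples compare element-wise).
    return max(configured_cc, lowest_supported_cc)

print(pick_cc(None))    # (7, 5) - the configured default is the higher of the two
print(pick_cc((9, 0)))  # (9, 0) - an explicit cc overrides the fallback
```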