Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions numba_cuda/numba/cuda/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def compile_cuda(
args,
debug=False,
lineinfo=False,
inline=False,
forceinline=False,
fastmath=False,
nvvm_options=None,
cc=None,
Expand Down Expand Up @@ -316,7 +316,7 @@ def compile_cuda(
else:
flags.error_model = "numpy"

if inline:
if forceinline:
flags.forceinline = True
if fastmath:
flags.fastmath = True
Expand Down Expand Up @@ -574,6 +574,7 @@ def compile(
abi="c",
abi_info=None,
output="ptx",
forceinline=False,
):
"""Compile a Python function to PTX or LTO-IR for a given set of argument
types.
Expand Down Expand Up @@ -614,6 +615,11 @@ def compile(
:type abi_info: dict
:param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
:type output: str
:param forceinline: Enables inlining at the NVVM IR level when set to
``True``. This is accomplished by adding the
``alwaysinline`` function attribute to the function
definition. This is only valid when the output is
``"ltoir"``.
:return: (code, resty): The compiled code and inferred return type
:rtype: tuple
"""
Expand All @@ -626,6 +632,12 @@ def compile(
if output not in ("ptx", "ltoir"):
raise NotImplementedError(f"Unsupported output type: {output}")

if forceinline and not device:
raise ValueError("Cannot force-inline kernels")

if forceinline and output != "ltoir":
raise ValueError("Can only designate forced inlining in LTO-IR")

debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
opt = (config.OPT != 0) if opt is None else opt

Expand Down Expand Up @@ -660,6 +672,7 @@ def compile(
fastmath=fastmath,
nvvm_options=nvvm_options,
cc=cc,
forceinline=forceinline,
)
resty = cres.signature.return_type

Expand Down Expand Up @@ -699,6 +712,7 @@ def compile_for_current_device(
abi="c",
abi_info=None,
output="ptx",
forceinline=False,
):
"""Compile a Python function to PTX or LTO-IR for a given signature for the
current device's compute capability. This calls :func:`compile` with an
Expand All @@ -716,6 +730,7 @@ def compile_for_current_device(
abi=abi,
abi_info=abi_info,
output=output,
forceinline=forceinline,
)


Expand All @@ -730,6 +745,7 @@ def compile_ptx(
opt=None,
abi="numba",
abi_info=None,
forceinline=False,
):
"""Compile a Python function to PTX for a given signature. See
:func:`compile`. The defaults for this function are to compile a kernel
Expand All @@ -747,6 +763,7 @@ def compile_ptx(
abi=abi,
abi_info=abi_info,
output="ptx",
forceinline=forceinline,
)


Expand All @@ -760,6 +777,7 @@ def compile_ptx_for_current_device(
opt=None,
abi="numba",
abi_info=None,
forceinline=False,
):
"""Compile a Python function to PTX for a given signature for the current
device's compute capability. See :func:`compile_ptx`."""
Expand All @@ -775,6 +793,7 @@ def compile_ptx_for_current_device(
opt=opt,
abi=abi,
abi_info=abi_info,
forceinline=forceinline,
)


Expand Down
16 changes: 15 additions & 1 deletion numba_cuda/numba/cuda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def jit(
func_or_sig=None,
device=False,
inline="never",
forceinline=False,
link=[],
debug=None,
opt=None,
Expand All @@ -39,6 +40,14 @@ def jit(
.. note:: A kernel cannot have any return value.
:param device: Indicates whether this is a device function.
:type device: bool
:param inline: Enables inlining at the Numba IR level when set to
``"always"``. See `Notes on Inlining
<https://numba.readthedocs.io/en/stable/developer/inlining.html>`_.
:type inline: str
:param forceinline: Enables inlining at the NVVM IR level when set to
``True``. This is accomplished by adding the ``alwaysinline`` function
attribute to the function definition.
:type forceinline: bool
:param link: A list of files containing PTX or CUDA C/C++ source to link
with the function
:type link: list
Expand Down Expand Up @@ -85,7 +94,9 @@ def jit(
DeprecationWarning(
"Passing bool to inline argument is deprecated, please refer to "
"Numba's documentation on inlining: "
"https://numba.readthedocs.io/en/stable/developer/inlining.html"
"https://numba.readthedocs.io/en/stable/developer/inlining.html. "
"You may have wanted the forceinline argument instead, to force "
"inlining at the NVVM IR level."
)

inline = "always" if inline else "never"
Expand Down Expand Up @@ -140,6 +151,7 @@ def _jit(func):
targetoptions["fastmath"] = fastmath
targetoptions["device"] = device
targetoptions["inline"] = inline
targetoptions["forceinline"] = forceinline
targetoptions["extensions"] = extensions

disp = CUDADispatcher(func, targetoptions=targetoptions)
Expand Down Expand Up @@ -182,6 +194,7 @@ def autojitwrapper(func):
func,
device=device,
inline=inline,
forceinline=forceinline,
debug=debug,
opt=opt,
lineinfo=lineinfo,
Expand All @@ -206,6 +219,7 @@ def autojitwrapper(func):
targetoptions["fastmath"] = fastmath
targetoptions["device"] = device
targetoptions["inline"] = inline
targetoptions["forceinline"] = forceinline
targetoptions["extensions"] = extensions
disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)

Expand Down
7 changes: 4 additions & 3 deletions numba_cuda/numba/cuda/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def __init__(
debug=False,
lineinfo=False,
inline=False,
forceinline=False,
fastmath=False,
extensions=None,
max_registers=None,
Expand Down Expand Up @@ -182,7 +183,7 @@ def __init__(
self.argtypes,
debug=self.debug,
lineinfo=lineinfo,
inline=inline,
forceinline=forceinline,
fastmath=fastmath,
nvvm_options=nvvm_options,
cc=cc,
Expand Down Expand Up @@ -1073,7 +1074,7 @@ def compile_device(self, args, return_type=None):
with self._compiling_counter:
debug = self.targetoptions.get("debug")
lineinfo = self.targetoptions.get("lineinfo")
inline = self.targetoptions.get("inline")
forceinline = self.targetoptions.get("forceinline")
fastmath = self.targetoptions.get("fastmath")

nvvm_options = {
Expand All @@ -1091,7 +1092,7 @@ def compile_device(self, args, return_type=None):
args,
debug=debug,
lineinfo=lineinfo,
inline=inline,
forceinline=forceinline,
fastmath=fastmath,
nvvm_options=nvvm_options,
cc=cc,
Expand Down
99 changes: 98 additions & 1 deletion numba_cuda/numba/cuda/tests/cudapy/test_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
)


@skip_on_cudasim("Cudasim does not support inline and forceinline")
class TestCudaInline(CUDATestCase):
@skip_on_cudasim("Cudasim does not support inline")
def _test_call_inline(self, inline):
"""Test @cuda.jit(inline=...)"""
a = np.ones(2, dtype=np.int32)
Expand Down Expand Up @@ -42,6 +42,9 @@ def call_set_zero(a):
# check that call was not inlined
self.assertIsNotNone(match, msg=llvm_ir)

# alwaysinline should not be in the IR when the inline kwarg is used
self.assertNotIn("alwaysinline", llvm_ir)

def test_call_inline_always(self):
self._test_call_inline("always")

Expand All @@ -54,6 +57,100 @@ def test_call_inline_true(self):
def test_call_inline_false(self):
self._test_call_inline(False)

def _test_call_forceinline(self, forceinline):
    """Test @cuda.jit(forceinline=...)

    Compiles ``set_zero`` with the given ``forceinline`` setting, calls it
    from a kernel, and then checks both the result array and the kernel's
    LLVM IR: a call instruction must still be present, ``alwaysinline``
    must appear on the ``set_zero`` definition only when ``forceinline`` is
    truthy, and it must never appear on the kernel's definition.

    :param forceinline: Value passed to ``cuda.jit`` as ``forceinline`` for
                        the callee.
    """
    a = np.ones(2, dtype=np.int32)

    sig = (types.int32[::1],)

    @cuda.jit(forceinline=forceinline)
    def set_zero(a):
        a[0] = 0

    @cuda.jit(sig)
    def call_set_zero(a):
        set_zero(a)

    call_set_zero[1, 2](a)

    # Only element 0 is zeroed, so the initial [1, 1] becomes [0, 1].
    expected = np.arange(2, dtype=np.int32)
    np.testing.assert_array_equal(a, expected)

    llvm_ir = call_set_zero.inspect_llvm(sig)

    # Check that call was not inlined at the Numba IR level - the call
    # should still be present in the IR. The IR is attached as the failure
    # message so a failing run shows what was actually generated.
    call_match = re.search(r"call [a-zA-Z0-9]* @", llvm_ir)
    self.assertIsNotNone(call_match, msg=llvm_ir)

    # Check the definition of set_zero - it is a definition where the
    # name does not include an underscore just before "set_zero", because
    # that would match the "call_set_zero" definition
    callee_def = re.search(r"define.*[^_]set_zero.*", llvm_ir)
    self.assertIsNotNone(callee_def, msg=llvm_ir)
    if forceinline:
        self.assertIn("alwaysinline", callee_def.group())
    else:
        self.assertNotIn("alwaysinline", callee_def.group())

    # The kernel, "call_set_zero", should never have "alwaysinline" set
    kernel_def = re.search(r"define.*call_set_zero.*", llvm_ir)
    self.assertIsNotNone(kernel_def, msg=llvm_ir)
    self.assertNotIn("alwaysinline", kernel_def.group())

def test_call_forceinline_true(self):
    """Run the shared forceinline checks with ``forceinline=True``."""
    self._test_call_forceinline(forceinline=True)

def test_call_forceinline_false(self):
    """Run the shared forceinline checks with ``forceinline=False``."""
    self._test_call_forceinline(forceinline=False)

def test_compile_forceinline_ltoir_only(self):
    """``cuda.compile`` must raise ``ValueError`` for ``forceinline=True``
    when LTO-IR output has not been requested."""

    def set_zero(a):
        a[0] = 0

    argtypes = (types.float32[::1],)

    # No ``output`` argument is passed, so "ltoir" is not being requested
    # here; the call is expected to be rejected.
    with self.assertRaisesRegex(
        ValueError, r"Can only designate forced inlining in LTO-IR"
    ):
        cuda.compile(set_zero, argtypes, device=True, forceinline=True)

def _compile_set_zero(self, forceinline):
    """Compile a small ``set_zero`` function to LTO-IR as a device function.

    :param forceinline: Forwarded to ``cuda.compile`` as ``forceinline``.
    :return: The first element of the pair returned by ``cuda.compile``
             (the generated code).
    """

    def set_zero(a):
        a[0] = 0

    generated, return_type = cuda.compile(
        set_zero,
        (types.float32[::1],),
        device=True,
        output="ltoir",
        forceinline=forceinline,
    )

    # Sanity check
    self.assertEqual(return_type, types.none)

    return generated

def test_compile_forceinline(self):
    """Toggling ``forceinline`` must change the LTO-IR that is produced."""
    baseline = self._compile_set_zero(False)
    forced = self._compile_set_zero(True)

    # As LTO-IR is opaque, the best we can do is check that changing the
    # flag resulted in a change in the generated LTO-IR in some way.
    self.assertNotEqual(
        baseline,
        forced,
        msg="forceinline flag appeared to have no effect on LTO-IR",
    )


# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()