Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 19 additions & 8 deletions numba_cuda/numba/cuda/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,9 @@ def _ensure_cc(self, cc):
return device.compute_capability

def get_asm_str(self, cc=None):
return "\n".join(self.get_asm_strs(cc=cc))

def get_asm_strs(self, cc=None):
cc = self._ensure_cc(cc)

ptxes = self._ptx_cache.get(cc, None)
Expand All @@ -218,21 +221,25 @@ def get_asm_str(self, cc=None):

irs = self.llvm_strs

ptx = nvvm.compile_ir(irs, **options)
if "g" in options:
ptxes = [nvvm.compile_ir(ir, **options) for ir in irs]
else:
ptxes = [nvvm.compile_ir(irs, **options)]

# Sometimes the result from NVVM contains trailing whitespace and
# nulls, which we strip so that the assembly dump looks a little
# tidier.
ptx = ptx.decode().strip("\x00").strip()
ptxes = [ptx.decode().strip("\x00").strip() for ptx in ptxes]

if config.DUMP_ASSEMBLY:
print(("ASSEMBLY %s" % self._name).center(80, "-"))
print(ptx)
for ptx in ptxes:
print(ptx)
print("=" * 80)

self._ptx_cache[cc] = ptx
self._ptx_cache[cc] = ptxes

return ptx
return ptxes

def get_lto_ptx(self, cc=None):
"""
Expand Down Expand Up @@ -284,8 +291,9 @@ def _link_all(self, linker, cc, ignore_nonlto=False):
ltoir = self.get_ltoir(cc=cc)
linker.add_ltoir(ltoir)
else:
ptx = self.get_asm_str(cc=cc)
linker.add_ptx(ptx.encode())
ptxes = self.get_asm_strs(cc=cc)
for ptx in ptxes:
linker.add_ptx(ptx.encode())

for path in self._linking_files:
linker.add_file_guess_ext(path, ignore_nonlto)
Expand Down Expand Up @@ -432,7 +440,10 @@ def finalize(self):
for mod in library.modules:
for fn in mod.functions:
if not fn.is_declaration:
fn.linkage = "linkonce_odr"
if "g" in self._nvvm_options:
fn.linkage = "weak_odr"
else:
fn.linkage = "linkonce_odr"

self._finalized = True

Expand Down
14 changes: 10 additions & 4 deletions numba_cuda/numba/cuda/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1023,10 +1023,9 @@ def compile_all(
)

if lto:
code = lib.get_ltoir(cc=cc)
codes = [lib.get_ltoir(cc=cc)]
else:
code = lib.get_asm_str(cc=cc)
codes = [code]
codes = lib.get_asm_strs(cc=cc)

# linking_files
is_ltoir = output == "ltoir"
Expand Down Expand Up @@ -1241,7 +1240,14 @@ def compile(
if lto:
code = lib.get_ltoir(cc=cc)
else:
code = lib.get_asm_str(cc=cc)
codes = lib.get_asm_strs(cc=cc)
if len(codes) == 1:
code = codes[0]
else:
raise RuntimeError(
"Compiling this function results in multiple "
"PTX files. Use compile_all() instead"
)
return code, resty


Expand Down
10 changes: 10 additions & 0 deletions numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,16 @@ def check_debug_info(self, ptx):
# ending in the filename of this module.
self.assertRegex(ptx, '\\.file.*test_compiler.py"')

# This test used to check for the presence of debuginfo, but in practice that
# made no sense: the C ABI wrapper generates a call instruction that has
# nothing to correlate with the DWARF, so it would confuse the debugger
# immediately anyway. Since the resolution of Issue #588 (each IR module is
# translated separately when debuginfo is enabled), debuginfo is not even
# produced for the ABI wrapper, because the wrapper's module contains no
# debuginfo in the first place. This test can therefore only be expected to
# fail until we have a proper way of generating device functions with the C
# ABI that does not rely on the hack of generating a wrapper.
@unittest.expectedFailure
def test_device_function_with_debug(self):
# See Issue #6719 - this ensures that compilation with debug succeeds
# with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
Expand Down
Loading