diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 40a12ee9a..c5073a7bb 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -2312,18 +2312,9 @@ def __init__(
         self.lineinfo = lineinfo
         self.cc = cc
         self.arch = arch
-        if lto is False:
-            # WAR for apparent nvjitlink issue
-            lto = None
         self.lto = lto
         self.additional_flags = additional_flags
-
-        self.options = LinkerOptions(
-            max_register_count=self.max_registers,
-            lineinfo=lineinfo,
-            arch=arch,
-            link_time_optimization=lto,
-        )
+        self._has_ltoir = False
         self._complete = False
         self._object_codes = []
         self.linker = None  # need at least one program
@@ -2446,6 +2437,8 @@ def add_cu(self, cu, name=""):
             print(obj.code)
 
         self._object_codes.append(obj)
+        if self.lto:
+            self._has_ltoir = True
 
     def add_cubin(self, cubin, name=""):
         obj = ObjectCode.from_cubin(cubin, name=name)
@@ -2454,6 +2447,7 @@ def add_cubin(self, cubin, name=""):
     def add_ltoir(self, ltoir, name=""):
         obj = ObjectCode.from_ltoir(ltoir, name=name)
         self._object_codes.append(obj)
+        self._has_ltoir = True
 
     def add_fatbin(self, fatbin, name=""):
         obj = ObjectCode.from_fatbin(fatbin, name=name)
@@ -2499,15 +2493,28 @@ def add_data(self, data, kind, name):
 
         fn(data, name)
 
-    def get_linked_ptx(self):
+    def _get_linker_options(self, ptx):
+        """Build the LinkerOptions for linking the accumulated object codes.
+
+        ``ptx`` requests the PTX-output flag; it is only passed on when LTOIR
+        object code has been added (add_ltoir, or add_cu with lto enabled).
+        """
+        # Some linker flags are only valid/required if LTOIR object code is present.
+        # WAR for cuda-core < 0.4.0 where passing False incorrectly appends flags
+        # (fixed in cuda-python PR #989, released in cuda-core v0.4.0)
+        lto_flag = True if self._has_ltoir else None
+        ptx_flag = True if (self._has_ltoir and ptx) else None
         options = LinkerOptions(
             max_register_count=self.max_registers,
             lineinfo=self.lineinfo,
             arch=self.arch,
-            link_time_optimization=True,
-            ptx=True,
+            link_time_optimization=lto_flag,
+            ptx=ptx_flag,
         )
+        return options
 
+    def get_linked_ptx(self):
+        options = self._get_linker_options(ptx=True)
         self.linker = Linker(*self._object_codes, options=options)
 
         result = self.linker.link("ptx")
@@ -2526,7 +2533,8 @@ def complete(self):
         cubin is a pointer to a internal buffer of cubin owned by the linker;
         thus, it should be loaded before the linker is destroyed.
         """
-        self.linker = Linker(*self._object_codes, options=self.options)
+        options = self._get_linker_options(ptx=False)
+        self.linker = Linker(*self._object_codes, options=options)
         result = self.linker.link("cubin")
         self.close()
         self._complete = True
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
index 23124ee06..2ca7493e4 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import numpy as np
+import os
 import pytest
 from numba.cuda.testing import unittest
 from numba.cuda.testing import (
@@ -109,6 +110,26 @@ def simple_lmem(A, B, dty):
         B[i] = C[i]
 
 
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+
+    # Declared only when the test binaries exist, so importing this module
+    # without NUMBA_CUDA_TEST_BIN_DIR set does not raise a NameError.
+    add_from_numba = cuda.declare_device(
+        "add_from_numba",
+        "int32(int32, int32)",
+        link=[test_device_functions_ltoir],
+    )
+
+
+def debuggable_kernel(result):
+    i = cuda.grid(1)
+    result[i] = add_from_numba(i, i)
+
+
 @skip_on_cudasim("Linking unsupported in the simulator")
 class TestLinker(CUDATestCase):
     @require_context
@@ -330,6 +351,11 @@ def test_get_local_mem_per_specialized(self):
         calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE
         self.assertGreaterEqual(local_mem_size, calc_size)
 
+    @unittest.skipIf(not TEST_BIN_DIR, "Necessary binaries are not available")
+    def test_debug_kernel_with_lto(self):
+        # Compile a debug, unoptimized kernel that links an LTOIR device function
+        cuda.jit("void(int32[::1])", debug=True, opt=False)(debuggable_kernel)
+
     @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
     def test_link_for_different_cc(self):
         linker = _Linker(max_registers=0, cc=(7, 5), lto=True)