diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ea7e7a98b..20efc1f6d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,11 +19,11 @@ jobs: - compute-matrix - build-conda - test-conda - - test-conda-pynvjitlink + - test-conda-ctypes-binding - test-simulator - build-wheels - test-wheels - - test-wheels-pynvjitlink + - test-wheels-ctypes-binding - test-wheels-deps-wheels - build-docs secrets: inherit @@ -76,14 +76,14 @@ jobs: script: "ci/test_conda.sh" run_codecov: false matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-conda-pynvjitlink: + test-conda-ctypes-binding: needs: - build-conda - compute-matrix uses: ./.github/workflows/conda-python-tests.yaml with: build_type: pull-request - script: "ci/test_conda_pynvjitlink.sh" + script: "ci/test_conda_ctypes_binding.sh" run_codecov: false # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version". matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} @@ -114,14 +114,14 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh false" matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-wheels-pynvjitlink: + test-wheels-ctypes-binding: needs: - build-wheels - compute-matrix uses: ./.github/workflows/wheels-test.yaml with: build_type: pull-request - script: "ci/test_wheel_pynvjitlink.sh" + script: "ci/test_wheel_ctypes_binding.sh" # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version". 
matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | split(".") | .[0] | tonumber >= 12))) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 2fbc53b71..04958da67 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -8,7 +8,7 @@ set -euo pipefail if [ "${CUDA_VER%.*.*}" = "11" ]; then CTK_PACKAGES="cudatoolkit=11" else - CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc libcurand-dev" + CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc libcurand-dev cuda-cuobjdump" apt-get update apt remove --purge `dpkg --get-selections | grep cuda-nvvm | awk '{print $1}'` -y apt remove --purge `dpkg --get-selections | grep cuda-nvrtc | awk '{print $1}'` -y diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_ctypes_binding.sh similarity index 89% rename from ci/test_conda_pynvjitlink.sh rename to ci/test_conda_ctypes_binding.sh index 78c636a45..06aea95af 100755 --- a/ci/test_conda_pynvjitlink.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -48,12 +48,6 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e - -rapids-logger "Install pynvjitlink" -set +u -rapids-mamba-retry install -c rapidsai pynvjitlink -set -u - rapids-logger "Build tests" PY_SCRIPT=" @@ -70,7 +64,7 @@ popd rapids-logger "Run Tests" -NUMBA_CUDA_ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v +NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 3d75032a2..8e4687890 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -10,7 +10,8 @@ package=$(realpath wheel/numba_cuda*.whl) echo "Package path: ${package}" python -m pip install \ "${package}[test]" \ - "cuda-python==${CUDA_VER_MAJOR_MINOR%.*}.*" + 
"cuda-python==${CUDA_VER_MAJOR_MINOR%.*}.*" \ + "cuda-core==0.3.*" GET_TEST_BINARY_DIR=" import numba_cuda diff --git a/ci/test_wheel_pynvjitlink.sh b/ci/test_wheel_ctypes_binding.sh similarity index 84% rename from ci/test_wheel_pynvjitlink.sh rename to ci/test_wheel_ctypes_binding.sh index 4ede2b967..67bff6126 100755 --- a/ci/test_wheel_pynvjitlink.sh +++ b/ci/test_wheel_ctypes_binding.sh @@ -11,7 +11,6 @@ echo "Package path: $package" python -m pip install \ "${package}[test]" \ cuda-python \ - "pynvjitlink-cu${CUDA_VER_MAJOR}" rapids-logger "Build tests" PY_SCRIPT=" @@ -23,7 +22,7 @@ print(test_dir) NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$PY_SCRIPT") pushd $NUMBA_CUDA_TEST_BIN_DIR -make +NUMBA_CUDA_USE_NVIDIA_BINDING=0 make popd @@ -35,9 +34,9 @@ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" rapids-logger "Show Numba system info" -python -m numba --sysinfo +NUMBA_CUDA_USE_NVIDIA_BINDING=0 python -m numba --sysinfo rapids-logger "Run Tests" -NUMBA_CUDA_ENABLE_PYNVJITLINK=1 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v +NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m numba.runtests numba.cuda.tests -v popd diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 86c00476e..5aecbfcb7 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -29,6 +29,7 @@ requirements: - python - numba >=0.59.1 - cuda-bindings + - cuda-core ==0.3.* about: home: {{ project_urls["Homepage"] }} diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index a17b5b186..e944fe0bf 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -2,11 +2,16 @@ from numba import runtests from numba.core import config from .utils import _readenv +import warnings + # Enable pynvjitlink based on the following precedence: # 1. 
Config setting "CUDA_ENABLE_PYNVJITLINK" (highest priority) # 2. Environment variable "NUMBA_CUDA_ENABLE_PYNVJITLINK" # 3. Auto-detection of pynvjitlink module (lowest priority) + +pynvjitlink_auto_enabled = False + if getattr(config, "CUDA_ENABLE_PYNVJITLINK", None) is None: if ( _pynvjitlink_enabled_in_env := _readenv( @@ -15,9 +20,10 @@ ) is not None: config.CUDA_ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env else: - config.CUDA_ENABLE_PYNVJITLINK = ( + pynvjitlink_auto_enabled = ( importlib.util.find_spec("pynvjitlink") is not None ) + config.CUDA_ENABLE_PYNVJITLINK = pynvjitlink_auto_enabled # Upstream numba sets CUDA_USE_NVIDIA_BINDING to 0 by default, so it always # exists. Override, but not if explicitly set to 0 in the envioronment. @@ -44,6 +50,21 @@ "bindings." ) +if config.CUDA_ENABLE_PYNVJITLINK: + if USE_NV_BINDING: + warnings.warn( + "Explicitly enabling pynvjitlink is no longer necessary. " + "NVIDIA bindings are enabled. cuda.core will be used " + "in place of pynvjitlink." + ) + elif pynvjitlink_auto_enabled: + # Ignore the fact that pynvjitlink is enabled, because that was an + # automatic decision based on discovering pynvjitlink was present; the + # user didn't ask for it + pass + else: + raise RuntimeError("nvJitLink requires the NVIDIA CUDA bindings. 
") + if config.ENABLE_CUDASIM: from .simulator_init import * else: diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 61c62a8f1..0b269ed52 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -22,7 +22,10 @@ def run_nvdisasm(cubin, flags): try: fd, fname = tempfile.mkstemp() with open(fname, "wb") as f: - f.write(cubin) + if config.CUDA_USE_NVIDIA_BINDING: + f.write(cubin.code) + else: + f.write(cubin) try: cp = subprocess.run( @@ -271,7 +274,7 @@ def get_cubin(self, cc=None): return cubin if self._lto and config.DUMP_ASSEMBLY: - linker = driver.Linker.new( + linker = driver._Linker.new( max_registers=self._max_registers, cc=cc, additional_flags=["-ptx"], @@ -280,14 +283,14 @@ def get_cubin(self, cc=None): # `-ptx` flag is meant to view the optimized PTX for LTO objects. # Non-LTO objects are not passed to linker. self._link_all(linker, cc, ignore_nonlto=True) - - ptx = linker.get_linked_ptx().decode("utf-8") + ptx = linker.get_linked_ptx() + ptx = ptx.decode("utf-8") print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, "-")) print(ptx) print("=" * 80) - linker = driver.Linker.new( + linker = driver._Linker.new( max_registers=self._max_registers, cc=cc, lto=self._lto ) self._link_all(linker, cc, ignore_nonlto=False) @@ -312,7 +315,6 @@ def get_cufunc(self): cufunc = self._cufunc_cache.get(device.id, None) if cufunc: return cufunc - cubin = self.get_cubin(cc=device.compute_capability) module = ctx.create_module_image( cubin, self._setup_functions, self._teardown_functions diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index a04f69753..b041d2688 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -42,6 +42,7 @@ import numpy as np from collections import namedtuple, deque + from numba import mviewbuf from numba.core import utils, serialize, config from .error import CudaSupportError, 
CudaDriverError @@ -58,6 +59,22 @@ NvJitLinker, NvJitLinkError = None, None +USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING + +if USE_NV_BINDING: + from cuda.bindings import driver as binding + from cuda.core.experimental import ( + Linker, + LinkerOptions, + ObjectCode, + ) + + # There is no definition of the default stream in the Nvidia bindings (nor + # is there at the C/C++ level), so we define it here so we don't need to + # use a magic number 0 in places where we want the default stream. + CU_STREAM_DEFAULT = 0 + + MIN_REQUIRED_CC = (3, 5) SUPPORTS_IPC = sys.platform.startswith("linux") @@ -108,6 +125,25 @@ def make_logger(): return logger +@functools.cache +def _have_nvjitlink(): + if not USE_NV_BINDING: + return False + try: + from cuda.bindings._internal import nvjitlink as nvjitlink_internal + from cuda.bindings._internal.utils import NotSupportedError + except ImportError: + return False + try: + return ( + nvjitlink_internal._inspect_function_pointer("__nvJitLinkVersion") + != 0 + ) + except NotSupportedError: + # no driver + return False + + class DeadMemoryError(RuntimeError): pass @@ -1472,7 +1508,7 @@ def create_module_ptx(self, ptx): if isinstance(ptx, str): ptx = ptx.encode("utf8") if USE_NV_BINDING: - image = ptx + image = ObjectCode.from_ptx(ptx) else: image = c_char_p(ptx) return self.create_module_image(image) @@ -1615,7 +1651,6 @@ def load_module_image_ctypes( option_keys = (drvapi.cu_jit_option * len(options))(*options.keys()) option_vals = (c_void_p * len(options))(*options.values()) - handle = drvapi.cu_module() try: driver.cuModuleLoadDataEx( @@ -1662,7 +1697,7 @@ def load_module_image_cuda_python( try: handle = driver.cuModuleLoadDataEx( - image, len(options), option_keys, option_vals + image.code, len(options), option_keys, option_vals ) except CudaAPIError as e: err_string = jiterrors.decode("utf-8") @@ -2722,7 +2757,7 @@ def launch_kernel( ) -class Linker(metaclass=ABCMeta): +class _LinkerBase(metaclass=ABCMeta): """Abstract base 
class for linkers""" @classmethod @@ -2735,30 +2770,27 @@ def new( additional_flags=None, ): driver_ver = driver.get_version() - if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and driver_ver >= ( - 12, - 0, - ): - raise ValueError("Use CUDA_ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC") - if config.CUDA_ENABLE_PYNVJITLINK and driver_ver < (12, 0): - raise ValueError("Enabling pynvjitlink requires CUDA 12.") - if config.CUDA_ENABLE_PYNVJITLINK: - linker = PyNvJitLinker - - elif config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - linker = MVCLinker + if driver_ver < (12, 0): + if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: + linker = MVCLinker + elif USE_NV_BINDING: + linker = _Linker + else: + linker = CtypesLinker else: if USE_NV_BINDING: - linker = CudaPythonLinker + linker = _Linker else: linker = CtypesLinker - if linker is PyNvJitLinker: - return linker(max_registers, lineinfo, cc, lto, additional_flags) - elif additional_flags or lto: - raise ValueError("LTO and additional flags require PyNvJitLinker") + params = (max_registers, lineinfo, cc) + if linker is _Linker: + params = (*params, lto, additional_flags) else: - return linker(max_registers, lineinfo, cc) + if lto or additional_flags: + raise ValueError("LTO and additional flags require nvjitlink") + + return linker(*params) @abstractmethod def __init__(self, max_registers, lineinfo, cc): @@ -2786,7 +2818,6 @@ def add_cu(self, cu, name): with driver.get_active_context() as ac: dev = driver.get_device(ac.devnum) cc = dev.compute_capability - ptx, log = nvrtc.compile(cu, name, cc) if config.DUMP_ASSEMBLY: @@ -2821,7 +2852,6 @@ def add_file_guess_ext(self, path_or_code, ignore_nonlto=False): LTO-ed portion of the PTX when linker is added with objects that can be both LTO-ed and not LTO-ed. 
""" - if isinstance(path_or_code, str): ext = pathlib.Path(path_or_code).suffix if ext == "": @@ -2901,7 +2931,148 @@ def complete(self): """ -class MVCLinker(Linker): +class _Linker(_LinkerBase): + def __init__( + self, + max_registers=None, + lineinfo=False, + cc=None, + lto=None, + additional_flags=None, + ): + arch = f"sm_{cc[0]}{cc[1]}" + self.max_registers = max_registers if max_registers else None + self.lineinfo = lineinfo + self.cc = cc + self.arch = arch + if lto is False: + # WAR for apparent nvjitlink issue + lto = None + self.lto = lto + self.additional_flags = additional_flags + + self.options = LinkerOptions( + max_register_count=self.max_registers, + lineinfo=lineinfo, + arch=arch, + link_time_optimization=lto, + ) + self._complete = False + self._object_codes = [] + self.linker = None # need at least one program + + @property + def info_log(self): + if not self.linker: + raise ValueError("Not Initialized") + if self._complete: + return self._info_log + raise RuntimeError("Link not yet complete.") + + @property + def error_log(self): + if not self.linker: + raise ValueError("Not Initialized") + if self._complete: + return self._error_log + raise RuntimeError("Link not yet complete.") + + def add_ptx(self, ptx, name=""): + obj = ObjectCode.from_ptx(ptx, name=name) + self._object_codes.append(obj) + + def add_cu(self, cu, name=""): + with driver.get_active_context() as ac: + dev = driver.get_device(ac.devnum) + cc = dev.compute_capability + obj, log = nvrtc.compile(cu, name, cc, ltoir=self.lto) + + if not self.lto and config.DUMP_ASSEMBLY: + print(("ASSEMBLY %s" % name).center(80, "-")) + print(obj.code) + + self._object_codes.append(obj) + + def add_cubin(self, cubin, name=""): + obj = ObjectCode.from_cubin(cubin, name=name) + self._object_codes.append(obj) + + def add_ltoir(self, ltoir, name=""): + obj = ObjectCode.from_ltoir(ltoir, name=name) + self._object_codes.append(obj) + + def add_fatbin(self, fatbin, name=""): + obj = 
ObjectCode.from_fatbin(fatbin, name=name) + self._object_codes.append(obj) + + def add_object(self, obj, name=""): + obj = ObjectCode.from_object(obj, name=name) + self._object_codes.append(obj) + + def add_library(self, lib, name=""): + obj = ObjectCode.from_library(lib, name=name) + self._object_codes.append(obj) + + def add_file(self, path, kind): + try: + data = cached_file_read(path, how="rb") + except FileNotFoundError: + raise LinkerError(f"{path} not found") + name = pathlib.Path(path).name + self.add_data(data, kind, name) + + def add_data(self, data, kind, name): + if kind == FILE_EXTENSION_MAP["ptx"]: + fn = self.add_ptx + elif kind == FILE_EXTENSION_MAP["cubin"]: + fn = self.add_cubin + elif kind == "cu": + fn = self.add_cu + elif ( + kind == FILE_EXTENSION_MAP["lib"] or kind == FILE_EXTENSION_MAP["a"] + ): + fn = self.add_library + elif kind == FILE_EXTENSION_MAP["fatbin"]: + fn = self.add_fatbin + elif kind == FILE_EXTENSION_MAP["o"]: + fn = self.add_object + elif kind == FILE_EXTENSION_MAP["ltoir"]: + fn = self.add_ltoir + else: + raise LinkerError(f"Don't know how to link {kind}") + + fn(data, name) + + def get_linked_ptx(self): + options = LinkerOptions( + max_register_count=self.max_registers, + lineinfo=self.lineinfo, + arch=self.arch, + link_time_optimization=True, + ptx=True, + ) + + self.linker = Linker(*self._object_codes, options=options) + + result = self.linker.link("ptx") + self.close() + self._complete = True + return result.code + + def close(self): + self._info_log = self.linker.get_info_log() + self._error_log = self.linker.get_error_log() + self.linker.close() + + def complete(self): + self.linker = Linker(*self._object_codes, options=self.options) + result = self.linker.link("cubin") + self.close() + self._complete = True + return result + + +class MVCLinker(_LinkerBase): """ Linker supporting Minor Version Compatibility, backed by the cubinlinker package. 
@@ -2996,7 +3167,7 @@ def complete(self): raise LinkerError from e -class CtypesLinker(Linker): +class CtypesLinker(_LinkerBase): """ Links for current device if no CC given """ @@ -3139,266 +3310,6 @@ def complete(self): return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) -class CudaPythonLinker(Linker): - """ - Links for current device if no CC given - """ - - def __init__(self, max_registers=0, lineinfo=False, cc=None): - super().__init__(max_registers, lineinfo, cc) - - logsz = config.CUDA_LOG_SIZE - linkerinfo = bytearray(logsz) - linkererrors = bytearray(logsz) - - jit_option = binding.CUjit_option - - options = { - jit_option.CU_JIT_INFO_LOG_BUFFER: linkerinfo, - jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz, - jit_option.CU_JIT_ERROR_LOG_BUFFER: linkererrors, - jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz, - jit_option.CU_JIT_LOG_VERBOSE: 1, - } - if max_registers: - options[jit_option.CU_JIT_MAX_REGISTERS] = max_registers - if lineinfo: - options[jit_option.CU_JIT_GENERATE_LINE_INFO] = 1 - - if cc is None: - # No option value is needed, but we need something as a placeholder - options[jit_option.CU_JIT_TARGET_FROM_CUCONTEXT] = 1 - else: - cc_val = cc[0] * 10 + cc[1] - cc_enum = getattr( - binding.CUjit_target, f"CU_TARGET_COMPUTE_{cc_val}" - ) - options[jit_option.CU_JIT_TARGET] = cc_enum - - raw_keys = list(options.keys()) - raw_values = list(options.values()) - self.handle = driver.cuLinkCreate(len(raw_keys), raw_keys, raw_values) - - weakref.finalize(self, driver.cuLinkDestroy, self.handle) - - self.linker_info_buf = linkerinfo - self.linker_errors_buf = linkererrors - - self._keep_alive = [linkerinfo, linkererrors, raw_keys, raw_values] - - @property - def info_log(self): - return self.linker_info_buf.decode("utf8") - - @property - def error_log(self): - return self.linker_errors_buf.decode("utf8") - - def add_cubin(self, cubin, name=""): - input_type = binding.CUjitInputType.CU_JIT_INPUT_CUBIN - return 
self._add_data(input_type, cubin, name) - - def add_ptx(self, ptx, name=""): - input_type = binding.CUjitInputType.CU_JIT_INPUT_PTX - return self._add_data(input_type, ptx, name) - - def add_object(self, object_, name=""): - input_type = binding.CUjitInputType.CU_JIT_INPUT_OBJECT - return self._add_data(input_type, object_, name) - - def add_fatbin(self, fatbin, name=""): - input_type = binding.CUjitInputType.CU_JIT_INPUT_FATBINARY - return self._add_data(input_type, fatbin, name) - - def add_library(self, library, name=""): - input_type = binding.CUjitInputType.CU_JIT_INPUT_LIBRARY - return self._add_data(input_type, library, name) - - def _add_data(self, input_type, data, name): - name_buffer = name.encode("utf8") - self._keep_alive += [data, name_buffer] - try: - driver.cuLinkAddData( - self.handle, input_type, data, len(data), name_buffer, 0, [], [] - ) - except CudaAPIError as e: - raise LinkerError("%s\n%s" % (e, self.error_log)) - - def add_data(self, data, kind, name=None): - # We pass the name as **kwargs to ensure the default name for the input - # type is used if none is supplied - kws = {} - if name is not None: - kws["name"] = name - - if kind == FILE_EXTENSION_MAP["cubin"]: - self.add_cubin(data, **kws) - elif kind == FILE_EXTENSION_MAP["fatbin"]: - self.add_fatbin(data, **kws) - elif kind == FILE_EXTENSION_MAP["a"]: - self.add_library(data, **kws) - elif kind == FILE_EXTENSION_MAP["ptx"]: - self.add_ptx(data, **kws) - elif kind == FILE_EXTENSION_MAP["o"]: - self.add_object(data, **kws) - elif kind == FILE_EXTENSION_MAP["ltoir"]: - raise LinkerError("CudaPythonLinker cannot link LTO-IR") - else: - raise LinkerError(f"Don't know how to link {kind}") - - def add_file(self, path, kind): - pathbuf = path.encode("utf8") - self._keep_alive.append(pathbuf) - - try: - driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, [], []) - except CudaAPIError as e: - if e.code == binding.CUresult.CUDA_ERROR_FILE_NOT_FOUND: - msg = f"{path} not found" - else: - msg = 
"%s\n%s" % (e, self.error_log) - raise LinkerError(msg) - - def complete(self): - try: - cubin_buf, size = driver.cuLinkComplete(self.handle) - except CudaAPIError as e: - raise LinkerError("%s\n%s" % (e, self.error_log)) - - assert size > 0, "linker returned a zero sized cubin" - del self._keep_alive[:] - # We return a copy of the cubin because it's owned by the linker - cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) - return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,))) - - -class PyNvJitLinker(Linker): - def __init__( - self, - max_registers=None, - lineinfo=False, - cc=None, - lto=False, - additional_flags=None, - ): - if NvJitLinker is None: - raise ImportError( - "Using pynvjitlink requires the pynvjitlink package to be " - "available" - ) - - if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY: - raise ValueError( - "Can't set CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and " - "CUDA_ENABLE_PYNVJITLINK at the same time" - ) - - if cc is None: - raise RuntimeError("PyNvJitLinker requires CC to be specified") - if not any(isinstance(cc, t) for t in [list, tuple]): - raise TypeError("`cc` must be a list or tuple of length 2") - - sm_ver = f"{cc[0] * 10 + cc[1]}" - arch = f"-arch=sm_{sm_ver}" - options = [arch] - if max_registers: - options.append(f"-maxrregcount={max_registers}") - if lineinfo: - options.append("-lineinfo") - if lto: - options.append("-lto") - if additional_flags is not None: - options.extend(additional_flags) - - self._linker = NvJitLinker(*options) - self.lto = lto - self.options = options - - @property - def info_log(self): - return self._linker.info_log - - @property - def error_log(self): - return self._linker.error_log - - def add_ptx(self, ptx, name=""): - self._linker.add_ptx(ptx, name) - - def add_fatbin(self, fatbin, name=""): - self._linker.add_fatbin(fatbin, name) - - def add_ltoir(self, ltoir, name=""): - self._linker.add_ltoir(ltoir, name) - - def add_object(self, obj, name=""): - self._linker.add_object(obj, 
name) - - def add_file(self, path, kind): - try: - data = cached_file_read(path, "rb") - except FileNotFoundError: - raise LinkerError(f"{path} not found") - - name = pathlib.Path(path).name - self.add_data(data, kind, name) - - def add_cu(self, cu, name): - """Add CUDA source in a string to the link. The name of the source - file should be specified in `name`.""" - with driver.get_active_context() as ac: - dev = driver.get_device(ac.devnum) - cc = dev.compute_capability - - program, log = nvrtc.compile(cu, name, cc, ltoir=self.lto) - - if not self.lto and config.DUMP_ASSEMBLY: - print(("ASSEMBLY %s" % name).center(80, "-")) - print(program) - print("=" * 80) - - suffix = ".ltoir" if self.lto else ".ptx" - program_name = os.path.splitext(name)[0] + suffix - # Link the program's PTX or LTOIR using the normal linker mechanism - if self.lto: - self.add_ltoir(program, program_name) - else: - self.add_ptx(program.encode(), program_name) - - def add_data(self, data, kind, name): - if kind == FILE_EXTENSION_MAP["cubin"]: - fn = self._linker.add_cubin - elif kind == FILE_EXTENSION_MAP["fatbin"]: - fn = self._linker.add_fatbin - elif kind == FILE_EXTENSION_MAP["a"]: - fn = self._linker.add_library - elif kind == FILE_EXTENSION_MAP["ptx"]: - return self.add_ptx(data, name) - elif kind == FILE_EXTENSION_MAP["o"]: - fn = self._linker.add_object - elif kind == FILE_EXTENSION_MAP["ltoir"]: - fn = self._linker.add_ltoir - else: - raise LinkerError(f"Don't know how to link {kind}") - - try: - fn(data, name) - except NvJitLinkError as e: - raise LinkerError from e - - def get_linked_ptx(self): - try: - return self._linker.get_linked_ptx() - except NvJitLinkError as e: - raise LinkerError from e - - def complete(self): - try: - return self._linker.get_linked_cubin() - except NvJitLinkError as e: - raise LinkerError from e - - # ----------------------------------------------------------------------------- diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py 
b/numba_cuda/numba/cuda/cudadrv/mappings.py index ca785e681..3ddc60724 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -2,17 +2,16 @@ from . import enums if config.CUDA_USE_NVIDIA_BINDING: - from cuda.bindings import driver + from cuda.bindings.driver import CUjitInputType - jitty = driver.CUjitInputType FILE_EXTENSION_MAP = { - "o": jitty.CU_JIT_INPUT_OBJECT, - "ptx": jitty.CU_JIT_INPUT_PTX, - "a": jitty.CU_JIT_INPUT_LIBRARY, - "lib": jitty.CU_JIT_INPUT_LIBRARY, - "cubin": jitty.CU_JIT_INPUT_CUBIN, - "fatbin": jitty.CU_JIT_INPUT_FATBINARY, - "ltoir": jitty.CU_JIT_INPUT_NVVM, + "o": CUjitInputType.CU_JIT_INPUT_OBJECT, + "ptx": CUjitInputType.CU_JIT_INPUT_PTX, + "a": CUjitInputType.CU_JIT_INPUT_LIBRARY, + "lib": CUjitInputType.CU_JIT_INPUT_LIBRARY, + "cubin": CUjitInputType.CU_JIT_INPUT_CUBIN, + "fatbin": CUjitInputType.CU_JIT_INPUT_FATBINARY, + "ltoir": CUjitInputType.CU_JIT_INPUT_NVVM, } else: FILE_EXTENSION_MAP = { diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index b01c8c60c..2513e2a48 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -27,6 +27,9 @@ # Result code nvrtc_result = c_int +if config.CUDA_USE_NVIDIA_BINDING: + from cuda.core.experimental import Program, ProgramOptions + class NvrtcResult(IntEnum): NVRTC_SUCCESS = 0 @@ -374,10 +377,14 @@ def compile(src, name, cc, ltoir=False): # - Relocatable Device Code (rdc) is needed to prevent device functions # being optimized away. 
major, minor = found - arch = f"--gpu-architecture=compute_{major}{minor}" + + if config.CUDA_USE_NVIDIA_BINDING: + arch = f"sm_{major}{minor}" + else: + arch = f"--gpu-architecture=compute_{major}{minor}" cuda_include = [ - f"-I{get_cuda_paths()['include_dir'].info}", + f"{get_cuda_paths()['include_dir'].info}", ] nvrtc_version = nvrtc.get_version() @@ -387,54 +394,83 @@ def compile(src, name, cc, ltoir=False): numba_cuda_path = os.path.dirname(cudadrv_path) if nvrtc_ver_major == 11: - numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '11')}" + numba_include = f"{os.path.join(numba_cuda_path, 'include', '11')}" else: - numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}" + numba_include = f"{os.path.join(numba_cuda_path, 'include', '12')}" if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS: - extra_search_paths = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":") - extra_includes = [f"-I{p}" for p in extra_search_paths] + extra_includes = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":") else: extra_includes = [] - nrt_path = os.path.join(numba_cuda_path, "memory_management") - nrt_include = f"-I{nrt_path}" - - options = [ - arch, - numba_include, - *cuda_include, - nrt_include, - *extra_includes, - "-rdc", - "true", - ] + nrt_include = os.path.join(numba_cuda_path, "memory_management") - if ltoir: - options.append("-dlto") + includes = [numba_include, *cuda_include, nrt_include, *extra_includes] - if nvrtc_version < (12, 0): - options += ["-std=c++17"] + if config.CUDA_USE_NVIDIA_BINDING: + options = ProgramOptions( + arch=arch, + include_path=includes, + relocatable_device_code=True, + std="c++17" if nvrtc_version < (12, 0) else None, + link_time_optimization=ltoir, + name=name, + ) - # Compile the program - compile_error = nvrtc.compile_program(program, options) + class Logger: + def __init__(self): + self.log = [] - # Get log from compilation - log = nvrtc.get_compile_log(program) + def write(self, msg): + self.log.append(msg) - # If the compile 
failed, provide the log in an exception - if compile_error: - msg = f"NVRTC Compilation failure whilst compiling {name}:\n\n{log}" - raise NvrtcError(msg) + logger = Logger() + if isinstance(src, bytes): + src = src.decode("utf8") - # Otherwise, if there's any content in the log, present it as a warning - if log: - msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}" - warnings.warn(msg) + prog = Program(src, "c++", options=options) + result = prog.compile("ltoir" if ltoir else "ptx", logs=logger) + log = "" + if logger.log: + log = logger.log + joined_logs = "\n".join(log) + warnings.warn(f"NVRTC log messages: {joined_logs}") + return result, log - if ltoir: - ltoir = nvrtc.get_lto(program) - return ltoir, log else: - ptx = nvrtc.get_ptx(program) - return ptx, log + includes = [f"-I{path}" for path in includes] + options = [ + arch, + *includes, + "-rdc", + "true", + ] + + if ltoir: + options.append("-dlto") + + if nvrtc_version < (12, 0): + options.append("-std=c++17") + + # Compile the program + compile_error = nvrtc.compile_program(program, options) + + # Get log from compilation + log = nvrtc.get_compile_log(program) + + # If the compile failed, provide the log in an exception + if compile_error: + msg = f"NVRTC Compilation failure whilst compiling {name}:\n\n{log}" + raise NvrtcError(msg) + + # Otherwise, if there's any content in the log, present it as a warning + if log: + msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}" + warnings.warn(msg) + + if ltoir: + ltoir = nvrtc.get_lto(program) + return ltoir, log + else: + ptx = nvrtc.get_ptx(program) + return ptx, log diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py index 704d7c9f1..d1b096a05 100644 --- a/numba_cuda/numba/cuda/decorators.py +++ b/numba_cuda/numba/cuda/decorators.py @@ -4,6 +4,7 @@ from numba.cuda.compiler import declare_device_function from numba.cuda.dispatcher import CUDADispatcher from numba.cuda.simulator.kernel import 
FakeCUDAKernel +from numba.cuda.cudadrv.driver import _have_nvjitlink _msg_deprecated_signature_arg = ( @@ -143,10 +144,13 @@ def jit( if lto is None: # Default to using LTO if pynvjitlink is available and we're not debugging - lto = config.CUDA_ENABLE_PYNVJITLINK and not debug + lto = _have_nvjitlink() and not debug else: - if lto and not config.CUDA_ENABLE_PYNVJITLINK: - raise RuntimeError("LTO requires pynvjitlink, which is not enabled") + if lto and not _have_nvjitlink(): + raise RuntimeError( + "LTO requires nvjitlink, which is not available " + "or not sufficiently recent (>=12.3)" + ) if sigutils.is_signature(func_or_sig): signatures = [func_or_sig] diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index f7f7f8948..08bd96098 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -129,6 +129,7 @@ def __init__( nvvm_options["g"] = None cc = get_current_device().compute_capability + cres = compile_cuda( self.py_func, types.void, diff --git a/numba_cuda/numba/cuda/memory_management/nrt.py b/numba_cuda/numba/cuda/memory_management/nrt.py index 8edf40b2e..d6e4f53ec 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.py +++ b/numba_cuda/numba/cuda/memory_management/nrt.py @@ -6,7 +6,7 @@ from numba import cuda, config from numba.core.runtime.nrt import _nrt_mstats from numba.cuda.cudadrv.driver import ( - Linker, + _Linker, driver, launch_kernel, USE_NV_BINDING, @@ -80,7 +80,7 @@ def _compile_memsys_module(self): cc = get_current_device().compute_capability # Create a new linker instance and add the cu file - linker = Linker.new(cc=cc) + linker = _Linker.new(cc=cc) linker.add_cu_file(memsys_mod) # Complete the linker and create a module from it diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py index 924924c93..8376969f1 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +++
b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py @@ -34,10 +34,10 @@ def get_device_count(self): driver = FakeDriver() -class Linker: +class _Linker: @classmethod def new(cls, max_registers=0, lineinfo=False, cc=None): - return Linker() + return _Linker() @property def lto(self): @@ -67,3 +67,7 @@ def launch_kernel(*args, **kwargs): if config.ENABLE_CUDASIM: config.CUDA_ENABLE_PYNVJITLINK = False + + +def _have_nvjitlink(): + return False diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py index d0bb25b68..745514a9f 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -39,7 +39,7 @@ def skip_if_lto(self, reason): # Some linkers need the compute capability to be specified, so we # always specify it here. cc = devices.get_context().device.compute_capability - linker = driver.Linker.new(cc=cc) + linker = driver._Linker.new(cc=cc) if linker.lto: self.skipTest(reason) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py index 6b16216f4..ed503dab6 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py @@ -1,14 +1,14 @@ import numpy as np import warnings +from numba import config from numba.cuda.testing import unittest from numba.cuda.testing import skip_on_cudasim, skip_if_cuda_includes_missing from numba.cuda.testing import CUDATestCase, test_data_dir -from numba.cuda.cudadrv.driver import CudaAPIError, Linker, LinkerError -from numba.cuda.cudadrv.error import NvrtcError +from numba.cuda.cudadrv.driver import CudaAPIError, _Linker, LinkerError from numba.cuda import require_context from numba.tests.support import ignore_internal_warnings from numba import cuda, void, float64, int64, int32, typeof, float32 - +from numba.cuda.cudadrv.error import NvrtcError CONST1D = np.arange(10, dtype=np.float64) @@ -107,7 +107,7 @@ class TestLinker(CUDATestCase): @require_context def 
test_linker_basic(self): """Simply go through the constructor and destructor""" - linker = Linker.new(cc=(5, 3)) + linker = _Linker.new(cc=(7, 5)) del linker def _test_linking(self, eager): @@ -183,7 +183,13 @@ def test_linking_cu_error(self): link = str(test_data_dir / "error.cu") - with self.assertRaises(NvrtcError) as e: + if config.CUDA_USE_NVIDIA_BINDING: + from cuda.core.experimental._utils.cuda_utils import NVRTCError + + errty = NVRTCError + else: + errty = NvrtcError + with self.assertRaises(errty) as e: @cuda.jit("void(int32)", link=[link]) def kernel(x): @@ -191,7 +197,12 @@ def kernel(x): msg = e.exception.args[0] # Check the error message refers to the NVRTC compile - self.assertIn("NVRTC Compilation failure", msg) + nvrtc_err_str = ( + "NVRTC_ERROR_COMPILATION" + if config.CUDA_USE_NVIDIA_BINDING + else "NVRTC Compilation failure" + ) + self.assertIn(nvrtc_err_str, msg) # Check the expected error in the CUDA source is reported self.assertIn('identifier "SYNTAX" is undefined', msg) # Check the filename is reported correctly diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 8dca18901..97a580187 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -1,22 +1,12 @@ from numba.cuda.testing import unittest from numba.cuda.testing import skip_on_cudasim from numba.cuda.testing import CUDATestCase -from numba.cuda.cudadrv.driver import PyNvJitLinker from numba.cuda import get_current_device +from numba.cuda.cudadrv.driver import _Linker, _have_nvjitlink from numba import cuda from numba import config -from numba.tests.support import run_in_subprocess, override_config -try: - import pynvjitlink # noqa: F401 - - PYNVJITLINK_INSTALLED = True -except ImportError: - PYNVJITLINK_INSTALLED = False - - -import itertools import os import io import contextlib @@ -52,85 +42,13 @@ @unittest.skipIf( - not 
config.CUDA_ENABLE_PYNVJITLINK or not TEST_BIN_DIR, - "pynvjitlink not enabled", + not config.CUDA_USE_NVIDIA_BINDING + or not TEST_BIN_DIR + or not _have_nvjitlink(), + "NVIDIA cuda bindings not enabled or nvJitLink not installed or new enough (>=12.3)", ) @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): - def test_nvjitlink_create(self): - patched_linker = PyNvJitLinker(cc=(7, 5)) - assert "-arch=sm_75" in patched_linker.options - - def test_nvjitlink_create_no_cc_error(self): - # nvJitLink expects at least the architecture to be specified. - with self.assertRaisesRegex( - RuntimeError, "PyNvJitLinker requires CC to be specified" - ): - PyNvJitLinker() - - def test_nvjitlink_invalid_arch_error(self): - from pynvjitlink.api import NvJitLinkError - - # CC 0.0 is not a valid compute capability - with self.assertRaisesRegex( - NvJitLinkError, "NVJITLINK_ERROR_UNRECOGNIZED_OPTION error" - ): - PyNvJitLinker(cc=(0, 0)) - - def test_nvjitlink_invalid_cc_type_error(self): - with self.assertRaisesRegex( - TypeError, "`cc` must be a list or tuple of length 2" - ): - PyNvJitLinker(cc=0) - - def test_nvjitlink_ptx_compile_options(self): - max_registers = (None, 32) - lineinfo = (False, True) - lto = (False, True) - additional_flags = (None, ("-g",), ("-g", "-time")) - for ( - max_registers_i, - line_info_i, - lto_i, - additional_flags_i, - ) in itertools.product(max_registers, lineinfo, lto, additional_flags): - with self.subTest( - max_registers=max_registers_i, - lineinfo=line_info_i, - lto=lto_i, - additional_flags=additional_flags_i, - ): - patched_linker = PyNvJitLinker( - cc=(7, 5), - max_registers=max_registers_i, - lineinfo=line_info_i, - lto=lto_i, - additional_flags=additional_flags_i, - ) - assert "-arch=sm_75" in patched_linker.options - - if max_registers_i: - assert ( - f"-maxrregcount={max_registers_i}" - in patched_linker.options - ) - else: - assert "-maxrregcount" not in patched_linker.options - - if line_info_i: - assert 
assert "-lineinfo" in patched_linker.options - else: - assert "-lineinfo" not in patched_linker.options - - if lto_i: - assert "-lto" in patched_linker.options - else: - assert "-lto" not in patched_linker.options - - if additional_flags_i: - for flag in additional_flags_i: - assert flag in patched_linker.options - def test_nvjitlink_add_file_guess_ext_linkable_code(self): files = ( test_device_functions_a, @@ -142,24 +60,20 @@ def test_nvjitlink_add_file_guess_ext_linkable_code(self): ) for file in files: with self.subTest(file=file): - patched_linker = PyNvJitLinker( - cc=get_current_device().compute_capability - ) - patched_linker.add_file_guess_ext(file) + linker = _Linker(cc=get_current_device().compute_capability) + linker.add_file_guess_ext(file) def test_nvjitlink_test_add_file_guess_ext_invalid_input(self): with open(test_device_functions_cubin, "rb") as f: content = f.read() - patched_linker = PyNvJitLinker( - cc=get_current_device().compute_capability - ) + linker = _Linker(cc=get_current_device().compute_capability) with self.assertRaisesRegex( TypeError, "Expected path to file or a LinkableCode" ): # Feeding raw data as bytes to add_file_guess_ext should raise, # because there's no way to know what kind of file to treat it as - patched_linker.add_file_guess_ext(content) + linker.add_file_guess_ext(content) def test_nvjitlink_jit_with_linkable_code(self): files = ( @@ -261,77 +175,5 @@ def kernel(): pass -@unittest.skipIf( - not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR, - reason="pynvjitlink not enabled", -) -@skip_on_cudasim("Linking unsupported in the simulator") -class TestLinkerUsage(CUDATestCase): - """Test that whether pynvjitlink can be enabled by both environment variable - and modification of config at runtime. 
- """ - - src = """if 1: - import os - from numba import cuda, config - - {config} - - TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR") - if TEST_BIN_DIR: - test_device_functions_cubin = os.path.join( - TEST_BIN_DIR, "test_device_functions.cubin" - ) - - sig = "uint32(uint32, uint32)" - add_from_numba = cuda.declare_device("add_from_numba", sig) - - @cuda.jit(link=[test_device_functions_cubin], lto=True) - def kernel(result): - result[0] = add_from_numba(1, 2) - - result = cuda.device_array(1) - kernel[1, 1](result) - assert result[0] == 3 - """ - - def test_linker_enabled_envvar(self): - env = os.environ.copy() - env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) - run_in_subprocess(self.src.format(config=""), env=env) - - def test_linker_disabled_envvar(self): - env = os.environ.copy() - env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0" - with self.assertRaisesRegex( - AssertionError, "LTO requires pynvjitlink, which is not enabled" - ): - # Actual error raised is `ValueError`, but `run_in_subprocess` - # reraises as AssertionError. 
- run_in_subprocess(self.src.format(config=""), env=env) - - def test_linker_enabled_config(self): - env = os.environ.copy() - env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) - run_in_subprocess( - self.src.format(config="config.CUDA_ENABLE_PYNVJITLINK = True"), - env=env, - ) - - def test_linker_disabled_config(self): - env = os.environ.copy() - env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) - with override_config("CUDA_ENABLE_PYNVJITLINK", False): - with self.assertRaisesRegex( - AssertionError, "LTO requires pynvjitlink, which is not enabled" - ): - run_in_subprocess( - self.src.format( - config="config.CUDA_ENABLE_PYNVJITLINK = False" - ), - env=env, - ) - - if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py index c663da0a6..b1f6a1835 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py @@ -1,7 +1,7 @@ from numba import cuda from numba.core.errors import TypingError from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim -from numba.tests.support import override_config +from numba import config def noop(x): @@ -91,15 +91,17 @@ def kernel_func(): self.assertIn("NameError: name 'floor' is not defined", excstr) @skip_on_cudasim("Simulator does not use pynvjitlink") - def test_lto_without_pynvjitlink_error(self): - with self.assertRaisesRegex(RuntimeError, "LTO requires pynvjitlink"): - with override_config("CUDA_ENABLE_PYNVJITLINK", False): + @unittest.skipIf( + config.CUDA_USE_NVIDIA_BINDING, "NVIDIA cuda bindings enabled" + ) + def test_lto_without_nvjitlink_error(self): + with self.assertRaisesRegex(RuntimeError, "LTO requires nvjitlink"): - @cuda.jit(lto=True) - def f(): - pass + @cuda.jit(lto=True) + def f(): + pass - f[1, 1]() + f[1, 1]() if __name__ == "__main__": diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_extending.py 
b/numba_cuda/numba/cuda/tests/cudapy/test_extending.py index 3dc24b60b..fdfb54464 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_extending.py @@ -1,4 +1,5 @@ from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase +from numba.cuda.cudadrv.driver import _have_nvjitlink from llvmlite import ir import numpy as np @@ -210,7 +211,7 @@ def test_extension_adds_linkable_code(self): (test_device_functions_ltoir, cuda.LTOIR), ) - lto = config.CUDA_ENABLE_PYNVJITLINK + lto = _have_nvjitlink() for path, ctor in files: if ctor == cuda.LTOIR and not lto: diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py index 81591b58e..2e14549cc 100644 --- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py @@ -169,7 +169,16 @@ def test_nrt_detect_linked_ptx_file(self): cc = get_current_device().compute_capability ptx, _ = compile(src, "external_nrt.cu", cc) - @cuda.jit(link=[PTXSource(ptx.encode(), nrt=True)]) + @cuda.jit( + link=[ + PTXSource( + ptx.code + if config.CUDA_USE_NVIDIA_BINDING + else ptx.encode(), + nrt=True, + ) + ] + ) def kernel(): allocate_deallocate_handle() diff --git a/pyproject.toml b/pyproject.toml index 51cca1cba..476be59ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = ["numba>=0.59.1"] [project.optional-dependencies] cu11 = [ "cuda-bindings==11.8.*", + "cuda-core==0.3.*", "cuda-python==11.8.*", # supports all CTK 11.x "nvidia-cuda-nvcc-cu11", # for libNVVM "nvidia-cuda-runtime-cu11", @@ -30,10 +31,12 @@ cu11 = [ ] cu12 = [ "cuda-bindings==12.9.*", + "cuda-core==0.3.*", "cuda-python==12.9.*", # supports all CTK 12.x "nvidia-cuda-nvcc-cu12", # for libNVVM "nvidia-cuda-runtime-cu12", "nvidia-cuda-nvrtc-cu12", + "nvidia-nvjitlink-cu12", ] test = [ "psutil",