diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py
index dbb2d94c9..e58b3f588 100644
--- a/numba_cuda/numba/cuda/api.py
+++ b/numba_cuda/numba/cuda/api.py
@@ -271,12 +271,8 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
     # compute size
     size = np.prod(shape) * dtype.itemsize
     # manually recreate the IPC mem handle
-    if driver.USE_NV_BINDING:
-        driver_handle = driver.binding.CUipcMemHandle()
-        driver_handle.reserved = handle
-    else:
-        driver_handle = driver.drvapi.cu_ipc_mem_handle()
-        driver_handle.reserved[:] = handle
+    driver_handle = driver.binding.CUipcMemHandle()
+    driver_handle.reserved = handle
     # use *IpcHandle* to open the IPC memory
     ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
     yield ipchandle.open_array(
diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py
index 4d38806a1..e8dc27f3b 100644
--- a/numba_cuda/numba/cuda/cudadrv/devicearray.py
+++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -112,10 +112,7 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
                 gpu_data = devices.get_context().memalloc(self.alloc_size)
         else:
             # Make NULL pointer for empty allocation
-            if _driver.USE_NV_BINDING:
-                null = _driver.binding.CUdeviceptr(0)
-            else:
-                null = c_void_p(0)
+            null = _driver.binding.CUdeviceptr(0)
             gpu_data = _driver.MemoryPointer(
                 context=devices.get_context(), pointer=null, size=0
             )
diff --git a/numba_cuda/numba/cuda/cudadrv/devices.py b/numba_cuda/numba/cuda/cudadrv/devices.py
index 19ac5e7b8..c2e678421 100644
--- a/numba_cuda/numba/cuda/cudadrv/devices.py
+++ b/numba_cuda/numba/cuda/cudadrv/devices.py
@@ -18,7 +18,7 @@
 import threading
 from contextlib import contextmanager
 
-from .driver import driver, USE_NV_BINDING
+from .driver import driver
 
 
 class _DeviceList(object):
@@ -43,7 +43,7 @@ def __getitem__(self, devnum):
         """
         Returns the context manager for device *devnum*.
         """
-        if not isinstance(devnum, (int, slice)) and USE_NV_BINDING:
+        if not isinstance(devnum, (int, slice)):
             devnum = int(devnum)
         return self.lst[devnum]
 
@@ -146,8 +146,7 @@ def get_or_create_context(self, devnum):
             else:
                 return attached_ctx
         else:
-            if USE_NV_BINDING:
-                devnum = int(devnum)
+            devnum = int(devnum)
             return self._activate_context_for(devnum)
 
     def _get_or_create_context_uncached(self, devnum):
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1337d77ec..69541df47 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -63,10 +63,6 @@
     ObjectCode,
 )
 
-# For backwards compatibility: indicate that the NVIDIA CUDA Python bindings are
-# in use. Older code checks this flag to branch on binding-specific behavior.
-USE_NV_BINDING = True
-
 # There is no definition of the default stream in the Nvidia bindings (nor
 # is there at the C/C++ level), so we define it here so we don't need to
 # use a magic number 0 in places where we want the default stream.
@@ -3176,11 +3172,7 @@ def device_memset(dst, val, size, stream=0):
     try:
         fn(ptr, val, size, *varargs)
     except CudaAPIError as e:
-        invalid = (
-            binding.CUresult.CUDA_ERROR_INVALID_VALUE
-            if USE_NV_BINDING
-            else enums.CUDA_ERROR_INVALID_VALUE
-        )
+        invalid = binding.CUresult.CUDA_ERROR_INVALID_VALUE
         if (
             e.code == invalid
             and getattr(dst, "__cuda_memory__", False)
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index eac850468..d9da06203 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -475,11 +475,7 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
         for t, v in zip(self.argument_types, args):
             self._prepare_args(t, v, stream, retr, kernelargs)
 
-        if driver.USE_NV_BINDING:
-            stream_handle = stream and stream.handle.value or 0
-        else:
-            zero_stream = None
-            stream_handle = stream and stream.handle or zero_stream
+        stream_handle = stream and stream.handle.value or 0
 
         # Invoke kernel
         driver.launch_kernel(
@@ -553,8 +549,7 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):
 
             ptr = driver.device_pointer(devary)
 
-            if driver.USE_NV_BINDING:
-                ptr = int(ptr)
+            ptr = int(ptr)
 
             data = ctypes.c_void_p(ptr)
diff --git a/numba_cuda/numba/cuda/memory_management/nrt.py b/numba_cuda/numba/cuda/memory_management/nrt.py
index 2e103f457..70ad01fe4 100644
--- a/numba_cuda/numba/cuda/memory_management/nrt.py
+++ b/numba_cuda/numba/cuda/memory_management/nrt.py
@@ -14,7 +14,6 @@
     _Linker,
     driver,
     launch_kernel,
-    USE_NV_BINDING,
     _have_nvjitlink,
 )
 from numba.cuda.cudadrv import devices
@@ -163,8 +162,7 @@ def allocate(self, stream=None):
         memsys_size = ctypes.c_uint64()
         ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size")
         device_memsys_size = ptr.device_ctypes_pointer
-        if USE_NV_BINDING:
-            device_memsys_size = device_memsys_size.value
+        device_memsys_size = device_memsys_size.value
         driver.cuMemcpyDtoH(
             ctypes.addressof(memsys_size), device_memsys_size, nbytes
         )
diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py
index 821d7af84..679112cab 100644
--- a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py
@@ -62,8 +62,5 @@ def launch_kernel(*args, **kwargs):
     raise RuntimeError(msg)
 
 
-USE_NV_BINDING = False
-
-
 def _have_nvjitlink():
     return False
diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py
index b3d211214..196d08897 100644
--- a/numba_cuda/numba/cuda/testing.py
+++ b/numba_cuda/numba/cuda/testing.py
@@ -299,10 +299,6 @@ def xfail_unless_cudasim(fn):
     return unittest.expectedFailure(fn)
 
 
-def skip_with_cuda_python(reason):
-    return unittest.skipIf(driver.USE_NV_BINDING, reason)
-
-
 def cudadevrt_missing():
     if config.ENABLE_CUDASIM:
         return False
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py b/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py
index 5935a1e2e..66825bd7e 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import numbers
-from ctypes import byref
 import weakref
 
 from numba import cuda
@@ -31,9 +30,6 @@ def test_gpus_cudevice_indexing(self):
         device_ids = [device.id for device in cuda.list_devices()]
         for device_id in device_ids:
             with cuda.gpus[device_id]:
-                # Check that the device is an integer if not using the CUDA
-                # Python bindings, otherwise it's a CUdevice object
-                assert isinstance(device_id, int) != driver.USE_NV_BINDING
                 self.assertEqual(cuda.gpus.current.id, device_id)
@@ -91,14 +87,9 @@ def tearDown(self):
     def test_attached_primary(self, extra_work=lambda: None):
         # Emulate primary context creation by 3rd party
         the_driver = driver.driver
-        if driver.USE_NV_BINDING:
-            dev = driver.binding.CUdevice(0)
-            binding_hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
-            hctx = driver.drvapi.cu_context(int(binding_hctx))
-        else:
-            dev = 0
-            hctx = driver.drvapi.cu_context()
-            the_driver.cuDevicePrimaryCtxRetain(byref(hctx), dev)
+        dev = driver.binding.CUdevice(0)
+        binding_hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
+        hctx = driver.drvapi.cu_context(int(binding_hctx))
         try:
             ctx = driver.Context(weakref.proxy(self), hctx)
             ctx.push()
@@ -115,33 +106,29 @@ def test_attached_non_primary(self):
         # Emulate non-primary context creation by 3rd party
         the_driver = driver.driver
-        if driver.USE_NV_BINDING:
-            flags = 0
-            dev = driver.binding.CUdevice(0)
-
-            result, version = driver.binding.cuDriverGetVersion()
-            self.assertEqual(
-                result,
-                driver.binding.CUresult.CUDA_SUCCESS,
-                "Error getting CUDA driver version",
-            )
-
-            # CUDA 13's cuCtxCreate has an optional parameter prepended. The
-            # version of cuCtxCreate in use depends on the cuda.bindings major
-            # version rather than the installed driver version on the machine
-            # we're running on.
-            from cuda import bindings
-
-            bindings_version = int(bindings.__version__.split(".")[0])
-            if bindings_version in (11, 12):
-                args = (flags, dev)
-            else:
-                args = (None, flags, dev)
-
-            hctx = the_driver.cuCtxCreate(*args)
+        flags = 0
+        dev = driver.binding.CUdevice(0)
+
+        result, version = driver.binding.cuDriverGetVersion()
+        self.assertEqual(
+            result,
+            driver.binding.CUresult.CUDA_SUCCESS,
+            "Error getting CUDA driver version",
+        )
+
+        # CUDA 13's cuCtxCreate has an optional parameter prepended. The
+        # version of cuCtxCreate in use depends on the cuda.bindings major
+        # version rather than the installed driver version on the machine
+        # we're running on.
+        from cuda import bindings
+
+        bindings_version = int(bindings.__version__.split(".")[0])
+        if bindings_version in (11, 12):
+            args = (flags, dev)
         else:
-            hctx = driver.drvapi.cu_context()
-            the_driver.cuCtxCreate(byref(hctx), 0, 0)
+            args = (None, flags, dev)
+
+        hctx = the_driver.cuCtxCreate(*args)
         try:
             cuda.current_context()
         except RuntimeError as e:
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py
index fca1ff628..60abcf664 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from ctypes import byref, c_int, sizeof
+from ctypes import c_int, sizeof
 
 from numba.cuda.cudadrv.driver import (
     host_to_device,
@@ -9,7 +9,7 @@
     driver,
     launch_kernel,
 )
-from numba.cuda.cudadrv import devices, drvapi, driver as _driver
+from numba.cuda.cudadrv import devices, driver as _driver
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim
 
@@ -96,8 +96,7 @@ def test_cuda_driver_basic(self):
         ptr = memory.device_ctypes_pointer
         stream = 0
 
-        if _driver.USE_NV_BINDING:
-            stream = _driver.binding.CUstream(stream)
+        stream = _driver.binding.CUstream(stream)
 
         launch_kernel(
             function.handle,  # Kernel
@@ -133,8 +132,7 @@ def test_cuda_driver_stream_operations(self):
         ptr = memory.device_ctypes_pointer
         stream_handle = stream.handle
 
-        if _driver.USE_NV_BINDING:
-            stream_handle = stream_handle.value
+        stream_handle = stream_handle.value
 
         launch_kernel(
             function.handle,  # Kernel
@@ -195,13 +193,8 @@ def test_cuda_driver_external_stream(self):
         # Test properties of a stream created from an external stream object.
         # We use the driver API directly to create a stream, to emulate an
         # external library creating a stream
-        if _driver.USE_NV_BINDING:
-            handle = driver.cuStreamCreate(0)
-            ptr = int(handle)
-        else:
-            handle = drvapi.cu_stream()
-            driver.cuStreamCreate(byref(handle), 0)
-            ptr = handle.value
+        handle = driver.cuStreamCreate(0)
+        ptr = int(handle)
         s = self.context.create_external_stream(ptr)
 
         self.assertIn("External CUDA stream", repr(s))
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py
index 7d8c216f9..d6852c8c5 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py
@@ -54,10 +54,7 @@ def test_derived_pointer(self):
         # Use MemoryPointer.view to create derived pointer
 
         def handle_val(mem):
-            if driver.USE_NV_BINDING:
-                return int(mem.handle)
-            else:
-                return mem.handle.value
+            return int(mem.handle)
 
         def check(m, offset):
             # create view
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py b/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py
index f503b5443..e768d9dbb 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py
@@ -2,8 +2,7 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import numpy as np
-from ctypes import byref, c_size_t
-from numba.cuda.cudadrv.driver import device_memset, driver, USE_NV_BINDING
+from numba.cuda.cudadrv.driver import device_memset, driver
 from numba import cuda
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim, skip_on_arm
@@ -22,14 +21,8 @@ def get_total_gpu_memory(self):
         # We use a driver function to directly get the total GPU memory because
         # an EMM plugin may report something different (or not implement
         # get_memory_info at all).
-        if USE_NV_BINDING:
-            free, total = driver.cuMemGetInfo()
-            return total
-        else:
-            free = c_size_t()
-            total = c_size_t()
-            driver.cuMemGetInfo(byref(free), byref(total))
-            return total.value
+        free, total = driver.cuMemGetInfo()
+        return total
 
     def skip_if_cc_major_lt(self, min_required, reason):
         """
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py b/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py
deleted file mode 100644
index e4ae1b88d..000000000
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-2-Clause
-
-import multiprocessing as mp
-import logging
-import traceback
-from numba.cuda.testing import unittest, CUDATestCase
-from numba.cuda.testing import (
-    skip_on_cudasim,
-    skip_with_cuda_python,
-    skip_under_cuda_memcheck,
-)
-from numba.cuda.tests.support import linux_only
-
-
-def child_test():
-    from numba import cuda, int32, void
-    from numba.cuda.core import config
-    import io
-    import numpy as np
-    import threading
-
-    # Enable PTDS before we make any CUDA driver calls. Enabling it first
-    # ensures that PTDS APIs are used because the CUDA driver looks up API
-    # functions on first use and memoizes them.
-    config.CUDA_PER_THREAD_DEFAULT_STREAM = 1
-
-    # Set up log capture for the Driver API so we can see what API calls were
-    # used.
-    logbuf = io.StringIO()
-    handler = logging.StreamHandler(logbuf)
-    cudadrv_logger = logging.getLogger("numba.cuda.cudadrv.driver")
-    cudadrv_logger.addHandler(handler)
-    cudadrv_logger.setLevel(logging.DEBUG)
-
-    # Set up data for our test, and copy over to the device
-    N = 2**16
-    N_THREADS = 10
-    N_ADDITIONS = 4096
-
-    # Seed the RNG for repeatability
-    np.random.seed(1)
-    x = np.random.randint(low=0, high=1000, size=N, dtype=np.int32)
-    r = np.zeros_like(x)
-
-    # One input and output array for each thread
-    xs = [cuda.to_device(x) for _ in range(N_THREADS)]
-    rs = [cuda.to_device(r) for _ in range(N_THREADS)]
-
-    # Compute the grid size and get the [per-thread] default stream
-    n_threads = 256
-    n_blocks = N // n_threads
-    stream = cuda.default_stream()
-
-    # A simple multiplication-by-addition kernel. What it does exactly is not
-    # too important; only that we have a kernel that does something.
-    @cuda.jit(void(int32[::1], int32[::1]))
-    def f(r, x):
-        i = cuda.grid(1)
-
-        if i > len(r):
-            return
-
-        # Accumulate x into r
-        for j in range(N_ADDITIONS):
-            r[i] += x[i]
-
-    # This function will be used to launch the kernel from each thread on its
-    # own unique data.
-    def kernel_thread(n):
-        f[n_blocks, n_threads, stream](rs[n], xs[n])
-
-    # Create threads
-    threads = [
-        threading.Thread(target=kernel_thread, args=(i,))
-        for i in range(N_THREADS)
-    ]
-
-    # Start all threads
-    for thread in threads:
-        thread.start()
-
-    # Wait for all threads to finish, to ensure that we don't synchronize with
-    # the device until all kernels are scheduled.
-    for thread in threads:
-        thread.join()
-
-    # Synchronize with the device
-    cuda.synchronize()
-
-    # Check output is as expected
-    expected = x * N_ADDITIONS
-    for i in range(N_THREADS):
-        np.testing.assert_equal(rs[i].copy_to_host(), expected)
-
-    # Return the driver log output to the calling process for checking
-    handler.flush()
-    return logbuf.getvalue()
-
-
-def child_test_wrapper(result_queue):
-    try:
-        output = child_test()
-        success = True
-    # Catch anything raised so it can be propagated
-    except:  # noqa: E722
-        output = traceback.format_exc()
-        success = False
-
-    result_queue.put((success, output))
-
-
-# Run on Linux only until the reason for test hangs on Windows (Issue #8635,
-# https://github.com/numba/numba/issues/8635) is diagnosed
-@linux_only
-@skip_under_cuda_memcheck("Hangs cuda-memcheck")
-@skip_on_cudasim("Streams not supported on the simulator")
-class TestPTDS(CUDATestCase):
-    @skip_with_cuda_python("Function names unchanged for PTDS with NV Binding")
-    def test_ptds(self):
-        # Run a test with PTDS enabled in a child process
-        ctx = mp.get_context("spawn")
-        result_queue = ctx.Queue()
-        proc = ctx.Process(target=child_test_wrapper, args=(result_queue,))
-        proc.start()
-        proc.join()
-        success, output = result_queue.get()
-
-        # Ensure the child process ran to completion before checking its output
-        if not success:
-            self.fail(output)
-
-        # Functions with a per-thread default stream variant that we expect to
-        # see in the output
-        ptds_functions = (
-            "cuMemcpyHtoD_v2_ptds",
-            "cuLaunchKernel_ptsz",
-            "cuMemcpyDtoH_v2_ptds",
-        )
-
-        for fn in ptds_functions:
-            with self.subTest(fn=fn, expected=True):
-                self.assertIn(fn, output)
-
-        # Non-PTDS versions of the functions that we should not see in the
-        # output:
-        legacy_functions = (
-            "cuMemcpyHtoD_v2",
-            "cuLaunchKernel",
-            "cuMemcpyDtoH_v2",
-        )
-
-        for fn in legacy_functions:
-            with self.subTest(fn=fn, expected=False):
-                # Ensure we only spot these function names appearing without a
-                # _ptds or _ptsz suffix by checking including the end of the
-                # line in the log
-                fn_at_end = f"{fn}\n"
-                self.assertNotIn(fn_at_end, output)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py
index 6e2c353a2..0c01cdcd9 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py
@@ -9,7 +9,6 @@
 import numpy as np
 
 from numba import cuda
-from numba.cuda.cudadrv import driver
 from numba.cuda.testing import (
     skip_on_arm,
     skip_on_cudasim,
@@ -106,10 +105,7 @@ def test_ipc_handle(self):
         ipch = ctx.get_ipc_handle(devarr.gpu_data)
 
         # manually prepare for serialization as bytes
-        if driver.USE_NV_BINDING:
-            handle_bytes = ipch.handle.reserved
-        else:
-            handle_bytes = bytes(ipch.handle)
+        handle_bytes = ipch.handle.reserved
         size = ipch.size
 
         # spawn new process for testing
@@ -153,12 +149,7 @@ def check_ipc_handle_serialization(self, index_arg=None, foreign=False):
 
         self.assertIs(ipch_recon.base, None)
         self.assertEqual(ipch_recon.size, ipch.size)
-        if driver.USE_NV_BINDING:
-            self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved)
-        else:
-            self.assertEqual(
-                ipch_recon.handle.reserved[:], ipch.handle.reserved[:]
-            )
+        self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved)
 
         # spawn new process for testing
         ctx = mp.get_context("spawn")
@@ -281,12 +272,7 @@ def test_staged(self):
         buf = pickle.dumps(ipch)
         ipch_recon = pickle.loads(buf)
         self.assertIs(ipch_recon.base, None)
-        if driver.USE_NV_BINDING:
-            self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved)
-        else:
-            self.assertEqual(
-                ipch_recon.handle.reserved[:], ipch.handle.reserved[:]
-            )
+        self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved)
         self.assertEqual(ipch_recon.size, ipch.size)
 
         # Test on every CUDA devices
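
With USE_NV_BINDING gone, callers can assume the NVIDIA cuda.bindings handle objects everywhere. Below is a minimal sketch (not part of the patch above) of how downstream code that previously branched on the flag can be simplified; the helper names are hypothetical, while the .value and int() accesses mirror the dispatcher.py and test changes in the patch:

    from numba.cuda.cudadrv import driver

    def raw_stream_pointer(stream):
        # stream.handle is now always a binding CUstream object; its raw
        # address is exposed via .value (as in the launch() change above).
        return stream.handle.value if stream else 0

    def raw_device_pointer(devary):
        # driver.device_pointer() returns a binding CUdeviceptr; int()
        # yields the raw address (as in the _prepare_args() change above).
        return int(driver.device_pointer(devary))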