diff --git a/numba_cuda/numba/cuda/args.py b/numba_cuda/numba/cuda/args.py index 755e19d00..9adecfde9 100644 --- a/numba_cuda/numba/cuda/args.py +++ b/numba_cuda/numba/cuda/args.py @@ -6,16 +6,13 @@ memory transfers before & after the kernel call. """ -import abc - from numba.cuda.typing.typeof import typeof, Purpose -class ArgHint(metaclass=abc.ABCMeta): +class ArgHint: def __init__(self, value): self.value = value - @abc.abstractmethod def to_device(self, retr, stream=0): """ :param stream: a stream to use when copying data @@ -25,7 +22,6 @@ def to_device(self, retr, stream=0): :return: a value (usually an `DeviceNDArray`) to be passed to the kernel """ - pass @property def _numba_type_(self): diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index e8dc27f3b..900aac6bd 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -199,10 +199,11 @@ def _numba_type_(self): @property def device_ctypes_pointer(self): """Returns the ctypes pointer to the GPU data buffer""" - if self.gpu_data is None: - return c_void_p(0) - else: + try: + # apparently faster in the non-exceptional case return self.gpu_data.device_ctypes_pointer + except AttributeError: + return c_void_p(0) @devices.require_context def copy_to_device(self, ary, stream=0): diff --git a/numba_cuda/numba/cuda/cudadrv/devices.py b/numba_cuda/numba/cuda/cudadrv/devices.py index c2e678421..514191384 100644 --- a/numba_cuda/numba/cuda/cudadrv/devices.py +++ b/numba_cuda/numba/cuda/cudadrv/devices.py @@ -21,23 +21,14 @@ from .driver import driver -class _DeviceList(object): - def __getattr__(self, attr): - # First time looking at "lst" attribute. - if attr == "lst": - # Device list is not initialized. - # Query all CUDA devices. 
- numdev = driver.get_device_count() - gpus = [ - _DeviceContextManager(driver.get_device(devid)) - for devid in range(numdev) - ] - # Define "lst" to avoid re-initialization - self.lst = gpus - return gpus - - # Other attributes - return super(_DeviceList, self).__getattr__(attr) +class _DeviceList: + @property + @functools.cache + def lst(self): + return [ + _DeviceContextManager(driver.get_device(devid)) + for devid in range(driver.get_device_count()) + ] def __getitem__(self, devnum): """ @@ -79,6 +70,9 @@ class _DeviceContextManager(object): def __init__(self, device): self._device = device + # Forwarded directly, to avoid the performance overhead of + # `__getattr__` and method lookup for a commonly accessed method + self.get_primary_context = self._device.get_primary_context def __getattr__(self, item): return getattr(self._device, item) @@ -88,10 +82,10 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): # this will verify that we are popping the right device context. - self._device.get_primary_context().pop() + self.get_primary_context().pop() def __str__(self): - return "<Managed Device {self.id}>".format(self=self) + return f"<Managed Device {self.id}>" class _Runtime(object): @@ -147,7 +141,8 @@ def get_or_create_context(self, devnum): return attached_ctx else: devnum = int(devnum) - return self._activate_context_for(devnum) + with self._lock: + return self._activate_context_for(devnum) def _get_or_create_context_uncached(self, devnum): """See also ``get_or_create_context(devnum)``. 
@@ -166,28 +161,29 @@ def _get_or_create_context_uncached(self, devnum): ctx_handle = ctx.handle.value ac_ctx_handle = ac.context_handle.value if ctx_handle != ac_ctx_handle: - msg = ( + raise RuntimeError( "Numba cannot operate on non-primary" - " CUDA context {:x}" + f" CUDA context {ac_ctx_handle:x}" ) - raise RuntimeError(msg.format(ac_ctx_handle)) # Ensure the context is ready ctx.prepare_for_use() return ctx def _activate_context_for(self, devnum): - with self._lock: - gpu = self.gpus[devnum] - newctx = gpu.get_primary_context() - # Detect unexpected context switch - cached_ctx = self._get_attached_context() - if cached_ctx is not None and cached_ctx is not newctx: - raise RuntimeError("Cannot switch CUDA-context.") - newctx.push() - return newctx + gpu = self.gpus[devnum] + newctx = gpu.get_primary_context() + # Detect unexpected context switch + cached_ctx = self._get_attached_context() + if cached_ctx is not None and cached_ctx is not newctx: + raise RuntimeError("Cannot switch CUDA-context.") + newctx.push() + return newctx def _get_attached_context(self): - return getattr(self._tls, "attached_context", None) + try: + return self._tls.attached_context + except AttributeError: + return None def _set_attached_context(self, ctx): self._tls.attached_context = ctx diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 69541df47..8bdd4f2af 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -470,10 +470,11 @@ class _ActiveContext(object): def __enter__(self): is_top = False # check TLS cache - if hasattr(self._tls_cache, "ctx_devnum"): - hctx, devnum = self._tls_cache.ctx_devnum - # Not cached. Query the driver API. - else: + cache = self._tls_cache + try: + hctx, devnum = cache.ctx_devnum + except AttributeError: + # Not cached. Query the driver API. 
hctx = driver.cuCtxGetCurrent() if int(hctx) == 0: hctx = None @@ -495,7 +496,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): if self._is_top: - delattr(self._tls_cache, "ctx_devnum") + del self._tls_cache.ctx_devnum def __bool__(self): """Returns True is there's a valid and active CUDA context.""" @@ -2061,6 +2062,10 @@ def deref(): self._mem.refct += 1 weakref.finalize(self, deref) + # pull this attribute out for speed, because it's used often and + # there's overhead to going through `__getattr__` + self.device_ctypes_pointer = self._view.device_ctypes_pointer + def __getattr__(self, fname): """Proxy MemoryPointer methods""" return getattr(self._view, fname) @@ -3072,7 +3077,11 @@ def is_device_memory(obj): "device_pointer" which value is an int object carrying the pointer value of the device memory address. This is not tested in this method. """ - return getattr(obj, "__cuda_memory__", False) + try: + # This is cheaper than getattr in the non-exceptional case + return obj.__cuda_memory__ + except AttributeError: + return False def require_device_memory(obj): diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index d9da06203..617562379 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -558,10 +558,8 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): kernelargs.append(nitems) kernelargs.append(itemsize) kernelargs.append(data) - for ax in range(devary.ndim): - kernelargs.append(c_intp(devary.shape[ax])) - for ax in range(devary.ndim): - kernelargs.append(c_intp(devary.strides[ax])) + kernelargs.extend(map(c_intp, devary.shape)) + kernelargs.extend(map(c_intp, devary.strides)) elif isinstance(ty, types.CPointer): # Pointer arguments should be a pointer-sized integer diff --git a/numba_cuda/numba/cuda/simulator/kernel.py b/numba_cuda/numba/cuda/simulator/kernel.py index 36cf20a2a..a654cb77b 100644 --- a/numba_cuda/numba/cuda/simulator/kernel.py +++ 
b/numba_cuda/numba/cuda/simulator/kernel.py @@ -11,7 +11,7 @@ from .cudadrv.devicearray import FakeCUDAArray, FakeWithinKernelCUDAArray from .kernelapi import Dim3, FakeCUDAModule, swapped_cuda_module from ..errors import normalize_kernel_dimensions -from ..args import wrap_arg, ArgHint +from ..args import ArgHint, InOut """ @@ -109,7 +109,7 @@ def fake_arg(arg): ) if isinstance(arg, np.ndarray) and arg.ndim > 0: - ret = wrap_arg(arg).to_device(retr) + ret = InOut(arg).to_device(retr) elif isinstance(arg, ArgHint): ret = arg.to_device(retr) elif isinstance(arg, np.void):