Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions numba_cuda/numba/cuda/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,13 @@
memory transfers before & after the kernel call.
"""

import abc

from numba.cuda.typing.typeof import typeof, Purpose


class ArgHint(metaclass=abc.ABCMeta):
Comment thread
cpcloud marked this conversation as resolved.
class ArgHint:
def __init__(self, value):
self.value = value

@abc.abstractmethod
def to_device(self, retr, stream=0):
"""
:param stream: a stream to use when copying data
Expand All @@ -25,7 +22,6 @@ def to_device(self, retr, stream=0):
:return: a value (usually a `DeviceNDArray`) to be passed to
the kernel
"""
pass

@property
def _numba_type_(self):
Expand Down
7 changes: 4 additions & 3 deletions numba_cuda/numba/cuda/cudadrv/devicearray.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,11 @@ def _numba_type_(self):
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer"""
if self.gpu_data is None:
return c_void_p(0)
else:
try:
# apparently faster in the non-exceptional case
return self.gpu_data.device_ctypes_pointer
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is faster because after Python 3.11, there's no cost to the non-exceptional case, and this case appears to be more common in the kernel launching path than the case of self.gpu_data is None being True.

except AttributeError:
return c_void_p(0)

@devices.require_context
def copy_to_device(self, ary, stream=0):
Expand Down
62 changes: 29 additions & 33 deletions numba_cuda/numba/cuda/cudadrv/devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,14 @@
from .driver import driver


class _DeviceList(object):
def __getattr__(self, attr):
# First time looking at "lst" attribute.
if attr == "lst":
# Device list is not initialized.
# Query all CUDA devices.
numdev = driver.get_device_count()
gpus = [
_DeviceContextManager(driver.get_device(devid))
for devid in range(numdev)
]
# Define "lst" to avoid re-initialization
self.lst = gpus
return gpus

# Other attributes
return super(_DeviceList, self).__getattr__(attr)
class _DeviceList:
@property
@functools.cache
def lst(self):
return [
_DeviceContextManager(driver.get_device(devid))
for devid in range(driver.get_device_count())
]

def __getitem__(self, devnum):
"""
Expand Down Expand Up @@ -79,6 +70,9 @@ class _DeviceContextManager(object):

def __init__(self, device):
self._device = device
# Forwarded directly, to avoid the performance overhead of
# `__getattr__` and method lookup for a commonly accessed method
self.get_primary_context = self._device.get_primary_context

def __getattr__(self, item):
return getattr(self._device, item)
Expand All @@ -88,10 +82,10 @@ def __enter__(self):

def __exit__(self, exc_type, exc_val, exc_tb):
# this will verify that we are popping the right device context.
self._device.get_primary_context().pop()
self.get_primary_context().pop()

def __str__(self):
return "<Managed Device {self.id}>".format(self=self)
return f"<Managed Device {self.id}>"


class _Runtime(object):
Expand Down Expand Up @@ -147,7 +141,8 @@ def get_or_create_context(self, devnum):
return attached_ctx
else:
devnum = int(devnum)
return self._activate_context_for(devnum)
with self._lock:
return self._activate_context_for(devnum)

def _get_or_create_context_uncached(self, devnum):
"""See also ``get_or_create_context(devnum)``.
Expand All @@ -166,28 +161,29 @@ def _get_or_create_context_uncached(self, devnum):
ctx_handle = ctx.handle.value
ac_ctx_handle = ac.context_handle.value
if ctx_handle != ac_ctx_handle:
msg = (
raise RuntimeError(
"Numba cannot operate on non-primary"
" CUDA context {:x}"
f" CUDA context {ac_ctx_handle:x}"
)
raise RuntimeError(msg.format(ac_ctx_handle))
# Ensure the context is ready
ctx.prepare_for_use()
return ctx

def _activate_context_for(self, devnum):
with self._lock:
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This lock (while reentrant, so not incorrect) was held inside of another section, so I just inlined the lock-holding to the one place this method was being called without a lock, and was able to remove one acquire and release operation.

gpu = self.gpus[devnum]
newctx = gpu.get_primary_context()
# Detect unexpected context switch
cached_ctx = self._get_attached_context()
if cached_ctx is not None and cached_ctx is not newctx:
raise RuntimeError("Cannot switch CUDA-context.")
newctx.push()
return newctx
gpu = self.gpus[devnum]
newctx = gpu.get_primary_context()
# Detect unexpected context switch
cached_ctx = self._get_attached_context()
if cached_ctx is not None and cached_ctx is not newctx:
raise RuntimeError("Cannot switch CUDA-context.")
newctx.push()
return newctx

def _get_attached_context(self):
return getattr(self._tls, "attached_context", None)
try:
return self._tls.attached_context
except AttributeError:
return None

def _set_attached_context(self, ctx):
self._tls.attached_context = ctx
Expand Down
21 changes: 15 additions & 6 deletions numba_cuda/numba/cuda/cudadrv/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,10 +470,11 @@ class _ActiveContext(object):
def __enter__(self):
is_top = False
# check TLS cache
if hasattr(self._tls_cache, "ctx_devnum"):
hctx, devnum = self._tls_cache.ctx_devnum
# Not cached. Query the driver API.
else:
cache = self._tls_cache
try:
hctx, devnum = cache.ctx_devnum
except AttributeError:
# Not cached. Query the driver API.
hctx = driver.cuCtxGetCurrent()
if int(hctx) == 0:
hctx = None
Expand All @@ -495,7 +496,7 @@ def __enter__(self):

def __exit__(self, exc_type, exc_val, exc_tb):
if self._is_top:
delattr(self._tls_cache, "ctx_devnum")
del self._tls_cache.ctx_devnum

def __bool__(self):
"""Returns True is there's a valid and active CUDA context."""
Expand Down Expand Up @@ -2061,6 +2062,10 @@ def deref():
self._mem.refct += 1
weakref.finalize(self, deref)

# pull this attribute out for speed, because it's used often and
# there's overhead to going through `__getattr__`
self.device_ctypes_pointer = self._view.device_ctypes_pointer

def __getattr__(self, fname):
"""Proxy MemoryPointer methods"""
return getattr(self._view, fname)
Expand Down Expand Up @@ -3072,7 +3077,11 @@ def is_device_memory(obj):
"device_pointer" which value is an int object carrying the pointer
value of the device memory address. This is not tested in this method.
"""
return getattr(obj, "__cuda_memory__", False)
try:
# This is cheaper than getattr in the non-exceptional case
Comment thread
cpcloud marked this conversation as resolved.
return obj.__cuda_memory__
except AttributeError:
return False


def require_device_memory(obj):
Expand Down
6 changes: 2 additions & 4 deletions numba_cuda/numba/cuda/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,10 +558,8 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):
kernelargs.append(nitems)
kernelargs.append(itemsize)
kernelargs.append(data)
for ax in range(devary.ndim):
kernelargs.append(c_intp(devary.shape[ax]))
for ax in range(devary.ndim):
kernelargs.append(c_intp(devary.strides[ax]))
kernelargs.extend(map(c_intp, devary.shape))
kernelargs.extend(map(c_intp, devary.strides))

elif isinstance(ty, types.CPointer):
# Pointer arguments should be a pointer-sized integer
Expand Down
4 changes: 2 additions & 2 deletions numba_cuda/numba/cuda/simulator/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .cudadrv.devicearray import FakeCUDAArray, FakeWithinKernelCUDAArray
from .kernelapi import Dim3, FakeCUDAModule, swapped_cuda_module
from ..errors import normalize_kernel_dimensions
from ..args import wrap_arg, ArgHint
from ..args import ArgHint, InOut


"""
Expand Down Expand Up @@ -109,7 +109,7 @@ def fake_arg(arg):
)

if isinstance(arg, np.ndarray) and arg.ndim > 0:
ret = wrap_arg(arg).to_device(retr)
ret = InOut(arg).to_device(retr)
elif isinstance(arg, ArgHint):
ret = arg.to_device(retr)
elif isinstance(arg, np.void):
Expand Down