diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py
index 752b20240..caacfd7c8 100644
--- a/numba_cuda/numba/cuda/api.py
+++ b/numba_cuda/numba/cuda/api.py
@@ -39,10 +39,9 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
 
     shape = desc["shape"]
     strides = desc.get("strides")
-    dtype = np.dtype(desc["typestr"])
 
     shape, strides, dtype = prepare_shape_strides_dtype(
-        shape, strides, dtype, order="C"
+        shape, strides, desc["typestr"], order="C"
     )
     size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
 
diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py
index 60cd29da5..580b4eb20 100644
--- a/numba_cuda/numba/cuda/api_util.py
+++ b/numba_cuda/numba/cuda/api_util.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 
+import functools
+
 
 def prepare_shape_strides_dtype(shape, strides, dtype, order):
     dtype = np.dtype(dtype)
@@ -14,25 +16,33 @@ def prepare_shape_strides_dtype(shape, strides, dtype, order):
         raise TypeError("shape must be an integer or tuple of integers")
     if isinstance(shape, int):
         shape = (shape,)
+    else:
+        shape = tuple(shape)
     if isinstance(strides, int):
         strides = (strides,)
     else:
-        strides = strides or _fill_stride_by_order(shape, dtype, order)
+        if not strides:
+            strides = _fill_stride_by_order(shape, dtype, order)
+        else:
+            strides = tuple(strides)
     return shape, strides, dtype
 
 
+@functools.cache
 def _fill_stride_by_order(shape, dtype, order):
-    nd = len(shape)
-    if nd == 0:
+    ndims = len(shape)
+    if not ndims:
         return ()
-    strides = [0] * nd
+    strides = [0] * ndims
     if order == "C":
         strides[-1] = dtype.itemsize
-        for d in reversed(range(nd - 1)):
+        # -2 because we subtract one for zero-based indexing and another one
+        # for skipping the already-filled-in last element
+        for d in range(ndims - 2, -1, -1):
             strides[d] = strides[d + 1] * shape[d + 1]
     elif order == "F":
         strides[0] = dtype.itemsize
-        for d in range(1, nd):
+        for d in range(1, ndims):
             strides[d] = strides[d - 1] * shape[d - 1]
     else:
         raise ValueError("must be either C/F order")
diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py
index 2b805e20c..6cdb12424 100644
--- a/numba_cuda/numba/cuda/cudadrv/devicearray.py
+++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -86,8 +86,13 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
         """
         if isinstance(shape, int):
             shape = (shape,)
+        else:
+            shape = tuple(shape)
         if isinstance(strides, int):
             strides = (strides,)
+        else:
+            if strides:
+                strides = tuple(strides)
         dtype = np.dtype(dtype)
         itemsize = dtype.itemsize
         self.ndim = ndim = len(shape)
@@ -96,9 +101,6 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
         self._dummy = dummy = dummyarray.Array.from_desc(
             0, shape, strides, itemsize
         )
-        # confirm that all elements of shape are ints
-        if not all(isinstance(dim, (int, np.integer)) for dim in shape):
-            raise TypeError("all elements of shape must be ints")
         self.shape = shape = dummy.shape
         self.strides = strides = dummy.strides
         self.dtype = dtype
@@ -121,17 +123,17 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
 
     @property
     def __cuda_array_interface__(self):
-        if self.device_ctypes_pointer.value is not None:
-            ptr = self.device_ctypes_pointer.value
+        if (value := self.device_ctypes_pointer.value) is not None:
+            ptr = value
         else:
             ptr = 0
 
         return {
-            "shape": tuple(self.shape),
+            "shape": self.shape,
             "strides": None if is_contiguous(self) else tuple(self.strides),
             "data": (ptr, False),
             "typestr": self.dtype.str,
"stream": int(self.stream) if self.stream != 0 else None, + "stream": int(stream) if (stream := self.stream) != 0 else None, "version": 3, } diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index b45483d3f..d15835557 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -3023,6 +3023,7 @@ def host_memory_extents(obj): return mviewbuf.memoryview_get_extents(obj) +@functools.cache def memory_size_from_info(shape, strides, itemsize): """Get the byte size of a contiguous memory buffer given the shape, strides and itemsize. diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py index ab2e0b8e2..2a344d8d1 100644 --- a/numba_cuda/numba/cuda/cudadrv/dummyarray.py +++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -5,6 +5,7 @@ import itertools import functools import operator +import numpy as np Extent = namedtuple("Extent", ["begin", "end"]) @@ -245,9 +246,12 @@ class Array(object): is_array = True @classmethod + @functools.cache def from_desc(cls, offset, shape, strides, itemsize): dims = [] for ashape, astride in zip(shape, strides): + if not isinstance(ashape, (int, np.integer)): + raise TypeError("all elements of shape must be ints") dim = Dim( offset, offset + ashape * astride, ashape, astride, single=False ) @@ -442,8 +446,8 @@ def reshape(self, *newdims, **kws): ret = self.from_desc( self.extent.begin, - shape=newdims, - strides=newstrides, + shape=tuple(newdims), + strides=tuple(newstrides), itemsize=self.itemsize, ) @@ -471,8 +475,8 @@ def squeeze(self, axis=None): newstrides.append(stride) newarr = self.from_desc( self.extent.begin, - shape=newshape, - strides=newstrides, + shape=tuple(newshape), + strides=tuple(newstrides), itemsize=self.itemsize, ) return newarr, list(self.iter_contiguous_extent()) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 617562379..3d9f45ddf 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -1629,11 +1629,15 @@ def typeof_pyval(self, val): try: return typeof(val, Purpose.argument) except ValueError: - if cuda.is_cuda_array(val): + if ( + interface := getattr(val, "__cuda_array_interface__") + ) is not None: # When typing, we don't need to synchronize on the array's # stream - this is done when the kernel is launched. + return typeof( - cuda.as_cuda_array(val, sync=False), Purpose.argument + cuda.from_cuda_array_interface(interface, sync=False), + Purpose.argument, ) else: raise diff --git a/pixi.lock b/pixi.lock index 4f6fa2a5c..9ba47994d 100644 --- a/pixi.lock +++ b/pixi.lock @@ -6054,7 +6054,7 @@ packages: - pypi: ./ name: numba-cuda version: 0.20.0 - sha256: 36ad961e5a11798c0d1a4bacb080e0b71c3f0ad10ca22a896b0d096a924924aa + sha256: 26f7cba2368d24de58dc72aa0869f7892d5ef115dcf2fe452f5203f4cef3e809 requires_dist: - numba>=0.60.0 - cuda-bindings>=12.9.1,<14.0.0 diff --git a/pyproject.toml b/pyproject.toml index 74e436ae6..ab8a8920b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -198,7 +198,7 @@ benchcmp = { cmd = [ "numba.cuda.tests.benchmarks", "--benchmark-only", "--benchmark-enable", - "--benchmark-group-by=func", + "--benchmark-group-by=name", "--benchmark-compare", ] }