From d7a6c5a87f8d3b1b6f810e75eabb6e446c378304 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:32:44 -0400 Subject: [PATCH 01/13] chore: benchcmp by name to make per-benchmark comparisons easier --- pixi.lock | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pixi.lock b/pixi.lock index 4f6fa2a5c..9ba47994d 100644 --- a/pixi.lock +++ b/pixi.lock @@ -6054,7 +6054,7 @@ packages: - pypi: ./ name: numba-cuda version: 0.20.0 - sha256: 36ad961e5a11798c0d1a4bacb080e0b71c3f0ad10ca22a896b0d096a924924aa + sha256: 26f7cba2368d24de58dc72aa0869f7892d5ef115dcf2fe452f5203f4cef3e809 requires_dist: - numba>=0.60.0 - cuda-bindings>=12.9.1,<14.0.0 diff --git a/pyproject.toml b/pyproject.toml index 74e436ae6..ab8a8920b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -198,7 +198,7 @@ benchcmp = { cmd = [ "numba.cuda.tests.benchmarks", "--benchmark-only", "--benchmark-enable", - "--benchmark-group-by=func", + "--benchmark-group-by=name", "--benchmark-compare", ] } From 779437ce764355268feffe9f0af73463ed116817 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:33:03 -0400 Subject: [PATCH 02/13] perf: cache `_fill_stride_by_order` --- numba_cuda/numba/cuda/api_util.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py index 60cd29da5..b8631da8d 100644 --- a/numba_cuda/numba/cuda/api_util.py +++ b/numba_cuda/numba/cuda/api_util.py @@ -3,6 +3,8 @@ import numpy as np +import functools + def prepare_shape_strides_dtype(shape, strides, dtype, order): dtype = np.dtype(dtype) @@ -14,13 +16,19 @@ def prepare_shape_strides_dtype(shape, strides, dtype, order): raise TypeError("shape must be an integer or tuple of integers") if isinstance(shape, int): shape = (shape,) + else: + shape = tuple(shape) if isinstance(strides, int): strides = 
(strides,) else: - strides = strides or _fill_stride_by_order(shape, dtype, order) + if not strides: + strides = _fill_stride_by_order(tuple(shape), dtype, order) + else: + strides = tuple(strides) return shape, strides, dtype +@functools.cache def _fill_stride_by_order(shape, dtype, order): nd = len(shape) if nd == 0: From 80e0a8f3149989e0d79a58a6d19942e46216a9f3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:33:19 -0400 Subject: [PATCH 03/13] perf: cache `memory_size_from_info` --- numba_cuda/numba/cuda/cudadrv/driver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index b45483d3f..d15835557 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -3023,6 +3023,7 @@ def host_memory_extents(obj): return mviewbuf.memoryview_get_extents(obj) +@functools.cache def memory_size_from_info(shape, strides, itemsize): """Get the byte size of a contiguous memory buffer given the shape, strides and itemsize. 
From 57644f99a4bb4e2744d2629e7022b219163604aa Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:33:31 -0400 Subject: [PATCH 04/13] perf: cache `Array.from_desc` --- numba_cuda/numba/cuda/cudadrv/dummyarray.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py index ab2e0b8e2..98bef04ea 100644 --- a/numba_cuda/numba/cuda/cudadrv/dummyarray.py +++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -245,6 +245,7 @@ class Array(object): is_array = True @classmethod + @functools.cache def from_desc(cls, offset, shape, strides, itemsize): dims = [] for ashape, astride in zip(shape, strides): From f4c15c70da784ccafa24f209aae90909ffdedbf1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:33:41 -0400 Subject: [PATCH 05/13] perf: move type checking into cached function --- numba_cuda/numba/cuda/cudadrv/devicearray.py | 3 --- numba_cuda/numba/cuda/cudadrv/dummyarray.py | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 2b805e20c..e44c1e598 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -96,9 +96,6 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): self._dummy = dummy = dummyarray.Array.from_desc( 0, shape, strides, itemsize ) - # confirm that all elements of shape are ints - if not all(isinstance(dim, (int, np.integer)) for dim in shape): - raise TypeError("all elements of shape must be ints") self.shape = shape = dummy.shape self.strides = strides = dummy.strides self.dtype = dtype diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py index 98bef04ea..a19e1e4f8 100644 --- a/numba_cuda/numba/cuda/cudadrv/dummyarray.py 
+++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -5,6 +5,7 @@ import itertools import functools import operator +import numpy as np Extent = namedtuple("Extent", ["begin", "end"]) @@ -249,6 +250,8 @@ class Array(object): def from_desc(cls, offset, shape, strides, itemsize): dims = [] for ashape, astride in zip(shape, strides): + if not isinstance(ashape, (int, np.integer)): + raise TypeError("all elements of shape must be ints") dim = Dim( offset, offset + ashape * astride, ashape, astride, single=False ) From e287ae25ce982ac116badf76ea97de99b7241f06 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:43:21 -0400 Subject: [PATCH 06/13] refactor: make shape and strides definitely tuples --- numba_cuda/numba/cuda/cudadrv/devicearray.py | 5 +++++ numba_cuda/numba/cuda/cudadrv/dummyarray.py | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index e44c1e598..1b3d5f08d 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -86,8 +86,13 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): """ if isinstance(shape, int): shape = (shape,) + else: + shape = tuple(shape) if isinstance(strides, int): strides = (strides,) + else: + if strides: + strides = tuple(strides) dtype = np.dtype(dtype) itemsize = dtype.itemsize self.ndim = ndim = len(shape) diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py index a19e1e4f8..2a344d8d1 100644 --- a/numba_cuda/numba/cuda/cudadrv/dummyarray.py +++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -446,8 +446,8 @@ def reshape(self, *newdims, **kws): ret = self.from_desc( self.extent.begin, - shape=newdims, - strides=newstrides, + shape=tuple(newdims), + strides=tuple(newstrides), itemsize=self.itemsize, ) @@ -475,8 +475,8 @@ def 
squeeze(self, axis=None): newstrides.append(stride) newarr = self.from_desc( self.extent.begin, - shape=newshape, - strides=newstrides, + shape=tuple(newshape), + strides=tuple(newstrides), itemsize=self.itemsize, ) return newarr, list(self.iter_contiguous_extent()) From e8c93e45e0bb71c7f7c43931b0eb61ed116933a8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:39:50 -0400 Subject: [PATCH 07/13] chore: remove extra tuple conversion --- numba_cuda/numba/cuda/api_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py index b8631da8d..ec685e27b 100644 --- a/numba_cuda/numba/cuda/api_util.py +++ b/numba_cuda/numba/cuda/api_util.py @@ -22,7 +22,7 @@ def prepare_shape_strides_dtype(shape, strides, dtype, order): strides = (strides,) else: if not strides: - strides = _fill_stride_by_order(tuple(shape), dtype, order) + strides = _fill_stride_by_order(shape, dtype, order) else: strides = tuple(strides) return shape, strides, dtype From 068cfbd4ef0b66f5436d7f2cc3a95231f102d543 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:23:41 -0400 Subject: [PATCH 08/13] chore: make name longer for clarity --- numba_cuda/numba/cuda/api_util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py index ec685e27b..6cc11e763 100644 --- a/numba_cuda/numba/cuda/api_util.py +++ b/numba_cuda/numba/cuda/api_util.py @@ -30,17 +30,17 @@ def prepare_shape_strides_dtype(shape, strides, dtype, order): @functools.cache def _fill_stride_by_order(shape, dtype, order): - nd = len(shape) - if nd == 0: + ndims = len(shape) + if ndims == 0: return () - strides = [0] * nd + strides = [0] * ndims if order == "C": strides[-1] = dtype.itemsize - for d in reversed(range(nd - 1)): + for d in 
reversed(range(ndims - 1)): strides[d] = strides[d + 1] * shape[d + 1] elif order == "F": strides[0] = dtype.itemsize - for d in range(1, nd): + for d in range(1, ndims): strides[d] = strides[d - 1] * shape[d - 1] else: raise ValueError("must be either C/F order") From 1729d82e8c091150a05bc353b2c5e18b7932c480 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:23:59 -0400 Subject: [PATCH 09/13] chore: avoid any `__eq__` overhead --- numba_cuda/numba/cuda/api_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py index 6cc11e763..cd4800588 100644 --- a/numba_cuda/numba/cuda/api_util.py +++ b/numba_cuda/numba/cuda/api_util.py @@ -31,7 +31,7 @@ def prepare_shape_strides_dtype(shape, strides, dtype, order): @functools.cache def _fill_stride_by_order(shape, dtype, order): ndims = len(shape) - if ndims == 0: + if not ndims: return () strides = [0] * ndims if order == "C": From 848dfcc8787d0a19e9f70e8d88ca7487c01d5bff Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:24:40 -0400 Subject: [PATCH 10/13] refactor: count backwards instead of reversing --- numba_cuda/numba/cuda/api_util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py index cd4800588..580b4eb20 100644 --- a/numba_cuda/numba/cuda/api_util.py +++ b/numba_cuda/numba/cuda/api_util.py @@ -36,7 +36,9 @@ def _fill_stride_by_order(shape, dtype, order): strides = [0] * ndims if order == "C": strides[-1] = dtype.itemsize - for d in reversed(range(ndims - 1)): + # -2 because we subtract one for zero-based indexing and another one + # for skipping the already-filled-in last element + for d in range(ndims - 2, -1, -1): strides[d] = strides[d + 1] * shape[d + 1] elif order == "F": strides[0] = dtype.itemsize From 
1c9e7aa44dc67015c47067a119428447948b48b2 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 23 Oct 2025 11:59:59 -0400 Subject: [PATCH 11/13] perf: avoid extra dtype call --- numba_cuda/numba/cuda/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py index 752b20240..caacfd7c8 100644 --- a/numba_cuda/numba/cuda/api.py +++ b/numba_cuda/numba/cuda/api.py @@ -39,10 +39,9 @@ def from_cuda_array_interface(desc, owner=None, sync=True): shape = desc["shape"] strides = desc.get("strides") - dtype = np.dtype(desc["typestr"]) shape, strides, dtype = prepare_shape_strides_dtype( - shape, strides, dtype, order="C" + shape, strides, desc["typestr"], order="C" ) size = driver.memory_size_from_info(shape, strides, dtype.itemsize) From 413137a75b4603bb54c6294e1851f2f2dfce891c Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 23 Oct 2025 12:00:12 -0400 Subject: [PATCH 12/13] perf: avoid extra cai access --- numba_cuda/numba/cuda/dispatcher.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 617562379..3d9f45ddf 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -1629,11 +1629,15 @@ def typeof_pyval(self, val): try: return typeof(val, Purpose.argument) except ValueError: - if cuda.is_cuda_array(val): + if ( + interface := getattr(val, "__cuda_array_interface__", None) + ) is not None: # When typing, we don't need to synchronize on the array's # stream - this is done when the kernel is launched. 
+ return typeof( - cuda.as_cuda_array(val, sync=False), Purpose.argument + cuda.from_cuda_array_interface(interface, sync=False), + Purpose.argument, ) else: raise From 462ffe2bc5a84e9ed43881db0ff5870b22c3bf13 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Fri, 24 Oct 2025 09:45:42 -0400 Subject: [PATCH 13/13] perf: remove a redundant attribute access --- numba_cuda/numba/cuda/cudadrv/devicearray.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 1b3d5f08d..6cdb12424 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -123,17 +123,17 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): @property def __cuda_array_interface__(self): - if self.device_ctypes_pointer.value is not None: - ptr = self.device_ctypes_pointer.value + if (value := self.device_ctypes_pointer.value) is not None: + ptr = value else: ptr = 0 return { - "shape": tuple(self.shape), + "shape": self.shape, "strides": None if is_contiguous(self) else tuple(self.strides), "data": (ptr, False), "typestr": self.dtype.str, - "stream": int(self.stream) if self.stream != 0 else None, + "stream": int(stream) if (stream := self.stream) != 0 else None, "version": 3, }