3 changes: 1 addition & 2 deletions numba_cuda/numba/cuda/api.py
```diff
@@ -39,10 +39,9 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
 
     shape = desc["shape"]
     strides = desc.get("strides")
-    dtype = np.dtype(desc["typestr"])
 
     shape, strides, dtype = prepare_shape_strides_dtype(
-        shape, strides, dtype, order="C"
+        shape, strides, desc["typestr"], order="C"
     )
     size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
```
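The point of this change: `prepare_shape_strides_dtype` already calls `np.dtype` on whatever it receives, so constructing the dtype at the call site was redundant work on a hot path. A minimal illustration (not part of the diff) of the idempotence that makes passing the raw `typestr` through safe:

```python
import numpy as np

# np.dtype accepts either a typestr or an existing dtype and yields the
# same dtype either way, so the old code built the identical dtype twice.
assert np.dtype("<f8") == np.dtype(np.dtype("<f8"))
```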
22 changes: 16 additions & 6 deletions numba_cuda/numba/cuda/api_util.py
```diff
@@ -3,6 +3,8 @@
 
 import numpy as np
 
+import functools
+
 
 def prepare_shape_strides_dtype(shape, strides, dtype, order):
     dtype = np.dtype(dtype)
@@ -14,25 +16,33 @@ def prepare_shape_strides_dtype(shape, strides, dtype, order):
         raise TypeError("shape must be an integer or tuple of integers")
     if isinstance(shape, int):
         shape = (shape,)
+    else:
+        shape = tuple(shape)
     if isinstance(strides, int):
         strides = (strides,)
     else:
-        strides = strides or _fill_stride_by_order(shape, dtype, order)
+        if not strides:
+            strides = _fill_stride_by_order(shape, dtype, order)
+        else:
+            strides = tuple(strides)
     return shape, strides, dtype
 
 
+@functools.cache
 def _fill_stride_by_order(shape, dtype, order):
-    nd = len(shape)
-    if nd == 0:
+    ndims = len(shape)
+    if not ndims:
         return ()
-    strides = [0] * nd
+    strides = [0] * ndims
     if order == "C":
         strides[-1] = dtype.itemsize
-        for d in reversed(range(nd - 1)):
+        # -2 because we subtract one for zero-based indexing and another
+        # one for skipping the already-filled-in last element
+        for d in range(ndims - 2, -1, -1):
             strides[d] = strides[d + 1] * shape[d + 1]
     elif order == "F":
         strides[0] = dtype.itemsize
-        for d in range(1, nd):
+        for d in range(1, ndims):
             strides[d] = strides[d - 1] * shape[d - 1]
     else:
         raise ValueError("must be either C/F order")
```
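`functools.cache` keys on the hash of its arguments, which is why `prepare_shape_strides_dtype` now normalizes `shape` and `strides` to tuples before `_fill_stride_by_order` is ever called. A sketch of the C-order stride rule being cached, assuming shape `(3, 4)` and `float64` purely for illustration:

```python
import numpy as np

# C order: the last axis moves by one item; each earlier axis moves by
# the product of all later dimensions.
shape = (3, 4)
itemsize = np.dtype(np.float64).itemsize  # 8 bytes
strides = [0] * len(shape)
strides[-1] = itemsize
for d in range(len(shape) - 2, -1, -1):
    strides[d] = strides[d + 1] * shape[d + 1]
assert tuple(strides) == np.zeros(shape).strides  # (32, 8)
```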
16 changes: 9 additions & 7 deletions numba_cuda/numba/cuda/cudadrv/devicearray.py
```diff
@@ -86,8 +86,13 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
         """
         if isinstance(shape, int):
             shape = (shape,)
+        else:
+            shape = tuple(shape)
         if isinstance(strides, int):
             strides = (strides,)
+        else:
+            if strides:
+                strides = tuple(strides)
         dtype = np.dtype(dtype)
         itemsize = dtype.itemsize
         self.ndim = ndim = len(shape)
@@ -96,9 +101,6 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
         self._dummy = dummy = dummyarray.Array.from_desc(
             0, shape, strides, itemsize
         )
-        # confirm that all elements of shape are ints
-        if not all(isinstance(dim, (int, np.integer)) for dim in shape):
-            raise TypeError("all elements of shape must be ints")
         self.shape = shape = dummy.shape
         self.strides = strides = dummy.strides
         self.dtype = dtype
```

Contributor Author, on the removed check: I moved this into `Array.from_desc` (see dummyarray.py below) because caching avoids having to run this code every time this constructor is called, saving a good chunk of time in the constructor.

```diff
@@ -121,17 +123,17 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
 
     @property
     def __cuda_array_interface__(self):
-        if self.device_ctypes_pointer.value is not None:
-            ptr = self.device_ctypes_pointer.value
+        if (value := self.device_ctypes_pointer.value) is not None:
+            ptr = value
         else:
             ptr = 0
 
         return {
-            "shape": tuple(self.shape),
+            "shape": self.shape,
             "strides": None if is_contiguous(self) else tuple(self.strides),
             "data": (ptr, False),
             "typestr": self.dtype.str,
-            "stream": int(self.stream) if self.stream != 0 else None,
+            "stream": int(stream) if (stream := self.stream) != 0 else None,
             "version": 3,
         }
```
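Since the constructor now guarantees `shape` is a tuple, the property can return `self.shape` directly instead of rebuilding a tuple on every access. For context, a small contiguous device array would export an interface dict along these lines (values are illustrative, assuming a 16-element float64 array on the default stream):

```python
{
    "shape": (16,),
    "strides": None,                # contiguous, so strides are elided
    "data": (139812345856, False),  # (device pointer, read-only flag)
    "typestr": "<f8",
    "stream": None,                 # default stream
    "version": 3,
}
```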
1 change: 1 addition & 0 deletions numba_cuda/numba/cuda/cudadrv/driver.py
```diff
@@ -3023,6 +3023,7 @@ def host_memory_extents(obj):
     return mviewbuf.memoryview_get_extents(obj)
 
 
+@functools.cache
 def memory_size_from_info(shape, strides, itemsize):
     """Get the byte size of a contiguous memory buffer given the shape, strides
     and itemsize.
```
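Caching is safe here because the byte size is a pure function of `(shape, strides, itemsize)`, and callers such as `from_cuda_array_interface` now pass hashable tuples. A sketch of the quantity involved, assuming non-negative strides (an illustration, not the driver's actual implementation):

```python
def buffer_extent(shape, strides, itemsize):
    # Furthest reachable byte: step (dim - 1) strides along each axis
    # from the start of the buffer, then add one final item.
    if not shape:
        return itemsize
    return sum((d - 1) * s for d, s in zip(shape, strides)) + itemsize

assert buffer_extent((3, 4), (32, 8), 8) == 3 * 4 * 8  # contiguous case
```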
12 changes: 8 additions & 4 deletions numba_cuda/numba/cuda/cudadrv/dummyarray.py
```diff
@@ -5,6 +5,7 @@
 import itertools
 import functools
 import operator
+import numpy as np
 
 
 Extent = namedtuple("Extent", ["begin", "end"])
@@ -245,9 +246,12 @@ class Array(object):
     is_array = True
 
     @classmethod
+    @functools.cache
     def from_desc(cls, offset, shape, strides, itemsize):
         dims = []
         for ashape, astride in zip(shape, strides):
+            if not isinstance(ashape, (int, np.integer)):
+                raise TypeError("all elements of shape must be ints")
             dim = Dim(
                 offset, offset + ashape * astride, ashape, astride, single=False
             )
@@ -442,8 +446,8 @@ def reshape(self, *newdims, **kws):
 
         ret = self.from_desc(
             self.extent.begin,
-            shape=newdims,
-            strides=newstrides,
+            shape=tuple(newdims),
+            strides=tuple(newstrides),
             itemsize=self.itemsize,
         )
@@ -471,8 +475,8 @@ def squeeze(self, axis=None):
             newstrides.append(stride)
         newarr = self.from_desc(
             self.extent.begin,
-            shape=newshape,
-            strides=newstrides,
+            shape=tuple(newshape),
+            strides=tuple(newstrides),
             itemsize=self.itemsize,
         )
         return newarr, list(self.iter_contiguous_extent())
```
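The `tuple(...)` conversions in `reshape` and `squeeze` exist because the now-cached `from_desc` hashes its arguments; passing lists would raise. A toy sketch of the constraint, using a hypothetical `Desc` class:

```python
import functools

class Desc:
    @classmethod
    @functools.cache  # same decorator order as the diff above
    def from_desc(cls, offset, shape, strides, itemsize):
        return (offset, shape, strides, itemsize)

Desc.from_desc(0, (3, 4), (32, 8), 8)    # fine: tuples are hashable
# Desc.from_desc(0, [3, 4], [32, 8], 8)  # TypeError: unhashable type: 'list'
```

One trade-off of this cache: identical descriptors now share a single `Array` instance, and cache entries live for the life of the process.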
8 changes: 6 additions & 2 deletions numba_cuda/numba/cuda/dispatcher.py
```diff
@@ -1629,11 +1629,15 @@ def typeof_pyval(self, val):
         try:
             return typeof(val, Purpose.argument)
         except ValueError:
-            if cuda.is_cuda_array(val):
+            if (
+                interface := getattr(val, "__cuda_array_interface__", None)
+            ) is not None:
                 # When typing, we don't need to synchronize on the array's
                 # stream - this is done when the kernel is launched.
+
                 return typeof(
-                    cuda.as_cuda_array(val, sync=False), Purpose.argument
+                    cuda.from_cuda_array_interface(interface, sync=False),
+                    Purpose.argument,
                 )
             else:
                 raise
```
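This replaces an `is_cuda_array` check plus an `as_cuda_array` call, each of which looks up `__cuda_array_interface__` itself, with a single attribute read. The same one-read idiom in isolation, with a hypothetical consumer function:

```python
def describe(obj):
    # Fetch __cuda_array_interface__ once; the None default makes the
    # walrus test double as the "is this a CUDA array?" check.
    if (interface := getattr(obj, "__cuda_array_interface__", None)) is not None:
        return interface["shape"], interface["typestr"]
    raise TypeError("not a CUDA array")
```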
2 changes: 1 addition & 1 deletion pixi.lock

(Generated lockfile; diff not rendered.)

2 changes: 1 addition & 1 deletion pyproject.toml
```diff
@@ -198,7 +198,7 @@ benchcmp = { cmd = [
     "numba.cuda.tests.benchmarks",
     "--benchmark-only",
     "--benchmark-enable",
-    "--benchmark-group-by=func",
+    "--benchmark-group-by=name",
     "--benchmark-compare",
 ] }
```

Contributor Author: This is a more useful grouping when comparing benchmarks.