From beeadd1975f41c6b1fcc8fa6ad2a2a00aa92bb03 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 14 Oct 2025 08:26:43 -0700 Subject: [PATCH 01/21] basic test, bits and pieces of nrt needed --- numba_cuda/numba/cuda/cpython/listobj.py | 2 +- .../numba/cuda/memory_management/nrt.cu | 37 ++- .../numba/cuda/memory_management/nrt.cuh | 1 + numba_cuda/numba/cuda/target.py | 18 +- .../numba/cuda/tests/test_array_reductions.py | 297 ++++++++++++++++++ numba_cuda/numba/cuda/typing/context.py | 2 +- 6 files changed, 348 insertions(+), 9 deletions(-) create mode 100644 numba_cuda/numba/cuda/tests/test_array_reductions.py diff --git a/numba_cuda/numba/cuda/cpython/listobj.py b/numba_cuda/numba/cuda/cpython/listobj.py index bd1dbe1f9..7f7834048 100644 --- a/numba_cuda/numba/cuda/cpython/listobj.py +++ b/numba_cuda/numba/cuda/cpython/listobj.py @@ -300,7 +300,7 @@ def define_dtor(self): # Declare dtor fnty = ir.FunctionType(ir.VoidType(), [cgutils.voidptr_t]) fn = cgutils.get_or_insert_function( - mod, fnty, ".dtor.list.{}".format(self.dtype) + mod, fnty, "numba_cuda_dtor_list_{}".format(self.dtype) ) if not fn.is_declaration: # End early if the dtor is already defined diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cu b/numba_cuda/numba/cuda/memory_management/nrt.cu index 729c8d8c4..25da96847 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cu +++ b/numba_cuda/numba/cuda/memory_management/nrt.cu @@ -78,6 +78,39 @@ extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) NRT_MemInfo_destroy(mi); } +static void __device__ +nrt_varsize_dtor(void *ptr, size_t size, void *info) { + if (info) { + /* call element dtor */ + typedef void dtor_fn_t(void *ptr); + dtor_fn_t *dtor = (dtor_fn_t *)info; + dtor(ptr); + } + NRT_Free(ptr); +} + +__device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size) +{ + NRT_MemInfo *mi = NULL; + void *data = NRT_Allocate(size); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + + mi = 
NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL); + + return mi; +} + +extern "C" +__device__ NRT_MemInfo* NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor) { + NRT_MemInfo *mi = NRT_MemInfo_new_varsize(size); + if (mi) { + mi->dtor_info = (void*)dtor; + } + return mi; +} + extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi) { return mi->data; @@ -167,7 +200,7 @@ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) } -#endif + extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) { @@ -175,3 +208,5 @@ extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) mi->refct++; } } + +#endif diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cuh b/numba_cuda/numba/cuda/memory_management/nrt.cuh index b79dbc7fd..ab7ae5ef7 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cuh +++ b/numba_cuda/numba/cuda/memory_management/nrt.cuh @@ -44,3 +44,4 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, size_t size, NRT_dtor_function dtor, void* dtor_info); +extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor); diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index c2db1a442..819470e51 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -36,7 +36,7 @@ # Typing -class CUDATypingContext(typing.BaseContext): +class CUDATypingContext(typing.Context): def load_additional_registries(self): from . 
import ( cudadecl, @@ -57,6 +57,7 @@ def load_additional_registries(self): self.install_registry(vector_types.typing_registry) self.install_registry(fp16.typing_registry) self.install_registry(bf16.typing_registry) + super().load_additional_registries() def resolve_value_type(self, val): # treat other dispatcher object as another device function @@ -171,11 +172,7 @@ def load_additional_registries(self): from numba.cuda.cpython import builtins as cpython_builtins from numba.cuda.core import optional # noqa: F401 from numba.cuda.misc import cffiimpl - from numba.cuda.np import ( - arrayobj, - npdatetime, - polynomial, - ) + from numba.cuda.np import arrayobj, npdatetime, polynomial, arraymath from . import ( cudaimpl, fp16, @@ -215,6 +212,7 @@ def load_additional_registries(self): self.install_registry(polynomial.registry) self.install_registry(npdatetime.registry) self.install_registry(arrayobj.registry) + self.install_registry(arraymath.registry) def codegen(self): return self._internal_codegen @@ -225,6 +223,14 @@ def target_data(self): self._target_data = ll.create_target_data(nvvm.NVVM().data_layout) return self._target_data + def build_list(self, builder, list_type, items): + """ + Build a list from the Numba *list_type* and its initial *items*. + """ + from numba.cuda.cpython import listobj + + return listobj.build_list(self, builder, list_type, items) + @cached_property def nonconst_module_attrs(self): """ diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py new file mode 100644 index 000000000..41c8b471a --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -0,0 +1,297 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-2-Clause +import numpy as np + +from numba import jit, njit +from numba.tests.support import TestCase, MemoryLeakMixin +from numba import cuda + + +def array_all(arr): + return arr.all() + + +def array_all_global(arr): + return np.all(arr) + + +def array_any(arr): + return arr.any() + + +def array_any_global(arr): + return np.any(arr) + + +def array_cumprod(arr): + return arr.cumprod() + + +def array_cumprod_global(arr): + return np.cumprod(arr) + + +def array_nancumprod(arr): + return np.nancumprod(arr) + + +def array_cumsum(arr): + return arr.cumsum() + + +def array_cumsum_global(arr): + return np.cumsum(arr) + + +def array_nancumsum(arr): + return np.nancumsum(arr) + + +def array_sum(arr): + return arr.sum() + + +def array_sum_global(arr): + return np.sum(arr) + + +def array_prod(arr): + return arr.prod() + + +def array_prod_global(arr): + return np.prod(arr) + + +def array_mean(arr): + return arr.mean() + + +def array_mean_global(arr): + return np.mean(arr) + + +def array_var(arr): + return arr.var() + + +def array_var_global(arr): + return np.var(arr) + + +def array_std(arr): + return arr.std() + + +def array_std_global(arr): + return np.std(arr) + + +def array_min(arr): + return arr.min() + + +def array_min_global(arr): + return np.min(arr) + + +def array_amin(arr): + return np.amin(arr) + + +def array_max(arr): + return arr.max() + + +def array_max_global(arr): + return np.max(arr) + + +def array_amax(arr): + return np.amax(arr) + + +def array_argmin(arr): + return arr.argmin() + + +def array_argmin_global(arr): + return np.argmin(arr) + + +def array_argmax(arr): + return arr.argmax() + + +def array_argmax_global(arr): + return np.argmax(arr) + + +def array_median_global(arr): + return np.median(arr) + + +def array_nanmin(arr): + return np.nanmin(arr) + + +def array_nanmax(arr): + return np.nanmax(arr) + + +def array_nanmean(arr): + return np.nanmean(arr) + + +def array_nansum(arr): + return np.nansum(arr) + + +def 
array_nanprod(arr): + return np.nanprod(arr) + + +def array_nanstd(arr): + return np.nanstd(arr) + + +def array_nanvar(arr): + return np.nanvar(arr) + + +def array_nanmedian_global(arr): + return np.nanmedian(arr) + + +def array_percentile_global(arr, q): + return np.percentile(arr, q) + + +def array_nanpercentile_global(arr, q): + return np.nanpercentile(arr, q) + + +def array_ptp_global(a): + return np.ptp(a) + + +def array_ptp(a): + return a.ptp() + + +def array_quantile_global(arr, q): + return np.quantile(arr, q) + + +def array_nanquantile_global(arr, q): + return np.nanquantile(arr, q) + + +def base_test_arrays(dtype): + if dtype == np.bool_: + + def factory(n): + assert n % 2 == 0 + return np.bool_([0, 1] * (n // 2)) + else: + + def factory(n): + return np.arange(n, dtype=dtype) + 1 + + a1 = factory(10) + a2 = factory(10).reshape(2, 5) + # The prod() of this array fits in a 32-bit int + a3 = (factory(12))[::-1].reshape((2, 3, 2), order="A") + assert not (a3.flags.c_contiguous or a3.flags.f_contiguous) + + return [a1, a2, a3] + + +def full_test_arrays(dtype): + array_list = base_test_arrays(dtype) + + # Add floats with some mantissa + if dtype == np.float32: + array_list += [a / 10 for a in array_list] + + # add imaginary part + if dtype == np.complex64: + acc = [] + for a in array_list: + tmp = a / 10 + 1j * a / 11 + tmp[::2] = np.conj(tmp[::2]) + acc.append(tmp) + array_list.extend(acc) + + for a in array_list: + assert a.dtype == np.dtype(dtype) + return array_list + + +def run_comparative(compare_func, test_array): + cfunc = njit(compare_func) + numpy_result = compare_func(test_array) + numba_result = cfunc(test_array) + + return numpy_result, numba_result + + +class TestArrayReductions(MemoryLeakMixin, TestCase): + """ + Test array reduction methods and functions such as .sum(), .max(), etc. 
+ """ + + def setUp(self): + super(TestArrayReductions, self).setUp() + np.random.seed(42) + + def check_reduction_basic(self, pyfunc, **kwargs): + # Basic reduction checks on 1-d float64 arrays + cfunc = jit(nopython=True)(pyfunc) + + def check(arr): + self.assertPreciseEqual(pyfunc(arr), cfunc(arr), **kwargs) + + arr = np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]) + check(arr) + arr = np.float64([-0.0, -1.5]) + check(arr) + arr = np.float64([-1.5, 2.5, "inf"]) + check(arr) + arr = np.float64([-1.5, 2.5, "-inf"]) + check(arr) + arr = np.float64([-1.5, 2.5, "inf", "-inf"]) + check(arr) + arr = np.float64(["nan", -1.5, 2.5, "nan", 3.0]) + check(arr) + arr = np.float64(["nan", -1.5, 2.5, "nan", "inf", "-inf", 3.0]) + check(arr) + arr = np.float64([5.0, "nan", -1.5, "nan"]) + check(arr) + # Only NaNs + arr = np.float64(["nan", "nan"]) + check(arr) + + def test_all_basic(self): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([1.0, 0.0, float("inf"), float("nan")]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + # arr = np.float64([1.0, 0.0, float('inf'), float('nan')]) + # check(arr) + # arr[1] = -0.0 + # check(arr) + # arr[1] = 1.5 + # check(arr) + # arr = arr.reshape((2, 2)) + # check(arr) + # check(arr[::-1]) diff --git a/numba_cuda/numba/cuda/typing/context.py b/numba_cuda/numba/cuda/typing/context.py index e511de2cb..d278e7df2 100644 --- a/numba_cuda/numba/cuda/typing/context.py +++ b/numba_cuda/numba/cuda/typing/context.py @@ -474,7 +474,7 @@ def is_for_this_target(ftcls): else: # A type was already inserted, see if we can add to it newty = existing.augment(gty) - if newty is None: + if newty is None and existing != gty: raise TypeError( "cannot augment %s with %s" % (existing, gty) ) From 634e976e5c6adb9668d428d28ad390c94c1d1ee9 Mon Sep 17 00:00:00 2001 
From: brandon-b-miller Date: Tue, 14 Oct 2025 13:30:23 -0700 Subject: [PATCH 02/21] some progress --- numba_cuda/numba/cuda/cpython/listobj.py | 6 +- .../numba/cuda/tests/test_array_reductions.py | 60 ++++++++++++++++--- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/numba_cuda/numba/cuda/cpython/listobj.py b/numba_cuda/numba/cuda/cpython/listobj.py index 7f7834048..2a8c661c7 100644 --- a/numba_cuda/numba/cuda/cpython/listobj.py +++ b/numba_cuda/numba/cuda/cpython/listobj.py @@ -299,8 +299,12 @@ def define_dtor(self): mod = builder.module # Declare dtor fnty = ir.FunctionType(ir.VoidType(), [cgutils.voidptr_t]) + if isinstance(self.dtype, types.containers.List): + dtypestr = f"list_{self.dtype.dtype}" + else: + dtypestr = str(self.dtype) fn = cgutils.get_or_insert_function( - mod, fnty, "numba_cuda_dtor_list_{}".format(self.dtype) + mod, fnty, "numba_cuda_dtor_list_{}".format(dtypestr) ) if not fn.is_declaration: # End early if the dtor is already defined diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 41c8b471a..cf829c36d 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -286,12 +286,54 @@ def kernel(out): arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - # arr = np.float64([1.0, 0.0, float('inf'), float('nan')]) - # check(arr) - # arr[1] = -0.0 - # check(arr) - # arr[1] = 1.5 - # check(arr) - # arr = arr.reshape((2, 2)) - # check(arr) - # check(arr[::-1]) + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([1.0, -0.0, float("inf"), float("nan")]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([1.0, -0.0, float("inf"), float("nan")]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + @cuda.jit + def 
kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([1.0, 1.5, float("inf"), float("nan")]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([1.0, 1.5, float("inf"), float("nan")]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) From 9e12144be188952d5d51bc0cdc6b5569168c3e7f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 06:04:45 -0700 Subject: [PATCH 03/21] Refactor --- .../numba/cuda/tests/test_array_reductions.py | 330 +----------------- 1 file changed, 15 insertions(+), 315 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index cf829c36d..7bb443357 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -2,240 +2,10 @@ # SPDX-License-Identifier: BSD-2-Clause import numpy as np -from numba import jit, njit from numba.tests.support import TestCase, MemoryLeakMixin from numba import cuda -def array_all(arr): - return arr.all() - - -def array_all_global(arr): - return np.all(arr) - - -def array_any(arr): - return arr.any() - - -def array_any_global(arr): - return np.any(arr) - - -def 
array_cumprod(arr): - return arr.cumprod() - - -def array_cumprod_global(arr): - return np.cumprod(arr) - - -def array_nancumprod(arr): - return np.nancumprod(arr) - - -def array_cumsum(arr): - return arr.cumsum() - - -def array_cumsum_global(arr): - return np.cumsum(arr) - - -def array_nancumsum(arr): - return np.nancumsum(arr) - - -def array_sum(arr): - return arr.sum() - - -def array_sum_global(arr): - return np.sum(arr) - - -def array_prod(arr): - return arr.prod() - - -def array_prod_global(arr): - return np.prod(arr) - - -def array_mean(arr): - return arr.mean() - - -def array_mean_global(arr): - return np.mean(arr) - - -def array_var(arr): - return arr.var() - - -def array_var_global(arr): - return np.var(arr) - - -def array_std(arr): - return arr.std() - - -def array_std_global(arr): - return np.std(arr) - - -def array_min(arr): - return arr.min() - - -def array_min_global(arr): - return np.min(arr) - - -def array_amin(arr): - return np.amin(arr) - - -def array_max(arr): - return arr.max() - - -def array_max_global(arr): - return np.max(arr) - - -def array_amax(arr): - return np.amax(arr) - - -def array_argmin(arr): - return arr.argmin() - - -def array_argmin_global(arr): - return np.argmin(arr) - - -def array_argmax(arr): - return arr.argmax() - - -def array_argmax_global(arr): - return np.argmax(arr) - - -def array_median_global(arr): - return np.median(arr) - - -def array_nanmin(arr): - return np.nanmin(arr) - - -def array_nanmax(arr): - return np.nanmax(arr) - - -def array_nanmean(arr): - return np.nanmean(arr) - - -def array_nansum(arr): - return np.nansum(arr) - - -def array_nanprod(arr): - return np.nanprod(arr) - - -def array_nanstd(arr): - return np.nanstd(arr) - - -def array_nanvar(arr): - return np.nanvar(arr) - - -def array_nanmedian_global(arr): - return np.nanmedian(arr) - - -def array_percentile_global(arr, q): - return np.percentile(arr, q) - - -def array_nanpercentile_global(arr, q): - return np.nanpercentile(arr, q) - - -def 
array_ptp_global(a): - return np.ptp(a) - - -def array_ptp(a): - return a.ptp() - - -def array_quantile_global(arr, q): - return np.quantile(arr, q) - - -def array_nanquantile_global(arr, q): - return np.nanquantile(arr, q) - - -def base_test_arrays(dtype): - if dtype == np.bool_: - - def factory(n): - assert n % 2 == 0 - return np.bool_([0, 1] * (n // 2)) - else: - - def factory(n): - return np.arange(n, dtype=dtype) + 1 - - a1 = factory(10) - a2 = factory(10).reshape(2, 5) - # The prod() of this array fits in a 32-bit int - a3 = (factory(12))[::-1].reshape((2, 3, 2), order="A") - assert not (a3.flags.c_contiguous or a3.flags.f_contiguous) - - return [a1, a2, a3] - - -def full_test_arrays(dtype): - array_list = base_test_arrays(dtype) - - # Add floats with some mantissa - if dtype == np.float32: - array_list += [a / 10 for a in array_list] - - # add imaginary part - if dtype == np.complex64: - acc = [] - for a in array_list: - tmp = a / 10 + 1j * a / 11 - tmp[::2] = np.conj(tmp[::2]) - acc.append(tmp) - array_list.extend(acc) - - for a in array_list: - assert a.dtype == np.dtype(dtype) - return array_list - - -def run_comparative(compare_func, test_array): - cfunc = njit(compare_func) - numpy_result = compare_func(test_array) - numba_result = cfunc(test_array) - - return numpy_result, numba_result - - class TestArrayReductions(MemoryLeakMixin, TestCase): """ Test array reduction methods and functions such as .sum(), .max(), etc. 
@@ -245,95 +15,25 @@ def setUp(self): super(TestArrayReductions, self).setUp() np.random.seed(42) - def check_reduction_basic(self, pyfunc, **kwargs): - # Basic reduction checks on 1-d float64 arrays - cfunc = jit(nopython=True)(pyfunc) - - def check(arr): - self.assertPreciseEqual(pyfunc(arr), cfunc(arr), **kwargs) - - arr = np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]) - check(arr) - arr = np.float64([-0.0, -1.5]) - check(arr) - arr = np.float64([-1.5, 2.5, "inf"]) - check(arr) - arr = np.float64([-1.5, 2.5, "-inf"]) - check(arr) - arr = np.float64([-1.5, 2.5, "inf", "-inf"]) - check(arr) - arr = np.float64(["nan", -1.5, 2.5, "nan", 3.0]) - check(arr) - arr = np.float64(["nan", -1.5, 2.5, "nan", "inf", "-inf", 3.0]) - check(arr) - arr = np.float64([5.0, "nan", -1.5, "nan"]) - check(arr) - # Only NaNs - arr = np.float64(["nan", "nan"]) - check(arr) - def test_all_basic(self): - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([1.0, 0.0, float("inf"), float("nan")]) - out[0] = np.all(ary) + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.all(arr) - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([1.0, -0.0, float("inf"), float("nan")]) - out[0] = np.all(ary) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - + check(arr) arr = np.float64([1.0, -0.0, float("inf"), float("nan")]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([1.0, 1.5, float("inf"), float("nan")]) - out[0] = np.all(ary) - - out = 
cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - + check(arr) arr = np.float64([1.0, 1.5, float("inf"), float("nan")]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) - out[0] = np.all(ary) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - + check(arr) arr = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) - out[0] = np.all(ary) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - - arr = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + check(arr) + arr = np.float64([[1.0, 1.5], [1.5, 1.0]]) + check(arr) From c4d4abe45b32545c60770a2f8c9f54fb54206dba Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 06:14:38 -0700 Subject: [PATCH 04/21] more reductions --- .../numba/cuda/tests/test_array_reductions.py | 374 ++++++++++++++++++ 1 file changed, 374 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 7bb443357..29df509f3 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -37,3 +37,377 @@ def kernel(out): check(arr) arr = np.float64([[1.0, 1.5], [1.5, 1.0]]) check(arr) + + def test_any_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.any(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + self.assertPreciseEqual(np.any(arr), out.copy_to_host()[0]) + + arr = np.float64([0.0, -0.0, 0.0, 0.0]) + check(arr) + arr[2] = float("nan") 
+ check(arr) + arr[2] = float("inf") + check(arr) + arr[2] = 1.5 + check(arr) + arr = arr.reshape((2, 2)) + check(arr) + check(arr[::-1]) + + def test_sum_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.sum(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.sum(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_mean_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.mean(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.mean(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_var_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.var(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.var(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, 
-1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_std_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.std(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.std(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_min_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.min(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.min(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_max_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 
1: + out[0] = np.max(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.max(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanmin_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanmin(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanmin(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanmax_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanmax(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanmax(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + 
np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanmean_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanmean(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanmean(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nansum_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nansum(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nansum(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanprod_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanprod(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanprod(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + 
np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanstd_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanstd(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanstd(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanvar_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanvar(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanvar(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) From 06226f20cd6e1528e6b8dd8c3a74ab082a5c734f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 07:52:55 -0700 Subject: [PATCH 05/21] fix ufuncs --- numba_cuda/numba/cuda/ufuncs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index f305d3ee4..a58dc9bce 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ 
b/numba_cuda/numba/cuda/ufuncs.py @@ -17,6 +17,7 @@ get_unary_impl_for_fn_and_ty, get_binary_impl_for_fn_and_ty, ) +from numba.cuda.np.ufunc_db import _fill_ufunc_db def get_ufunc_info(ufunc_key): @@ -682,4 +683,6 @@ def np_real_atanh_impl(context, builder, sig, args): "D->D": npyfuncs.np_complex_log10_impl, } + _fill_ufunc_db(db) + return db From 661180525caf2e01590b534e62f29103d448de5c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 10:59:41 -0700 Subject: [PATCH 06/21] enable nrt --- numba_cuda/numba/cuda/tests/test_array_reductions.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 29df509f3..617f9d753 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -4,6 +4,7 @@ from numba.tests.support import TestCase, MemoryLeakMixin from numba import cuda +from numba.cuda import config class TestArrayReductions(MemoryLeakMixin, TestCase): @@ -14,6 +15,12 @@ class TestArrayReductions(MemoryLeakMixin, TestCase): def setUp(self): super(TestArrayReductions, self).setUp() np.random.seed(42) + self.old_nrt_setting = config.CUDA_ENABLE_NRT + config.CUDA_ENABLE_NRT = True + + def tearDown(self): + config.CUDA_ENABLE_NRT = self.old_nrt_setting + super(TestArrayReductions, self).tearDown() def test_all_basic(self): def check(arr): From 7bc54a0954093247f444f1808dae9eec396e1412 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 15:24:40 -0700 Subject: [PATCH 07/21] fixes --- numba_cuda/numba/cuda/ufuncs.py | 63 +++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index 0a3f8c43b..6133cd25f 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -17,7 +17,6 @@ get_unary_impl_for_fn_and_ty, 
get_binary_impl_for_fn_and_ty, ) -from numba.cuda.np.ufunc_db import _fill_ufunc_db def get_ufunc_info(ufunc_key): @@ -683,6 +682,66 @@ def np_real_atanh_impl(context, builder, sig, args): "D->D": npyfuncs.np_complex_log10_impl, } - _fill_ufunc_db(db) + db[np.isnan] = { + "f->?": npyfuncs.np_real_isnan_impl, + "d->?": npyfuncs.np_real_isnan_impl, + "F->?": npyfuncs.np_complex_isnan_impl, + "D->?": npyfuncs.np_complex_isnan_impl, + # int8 + "b->?": npyfuncs.np_int_isnan_impl, + "B->?": npyfuncs.np_int_isnan_impl, + # int16 + "h->?": npyfuncs.np_int_isnan_impl, + "H->?": npyfuncs.np_int_isnan_impl, + # int32 + "i->?": npyfuncs.np_int_isnan_impl, + "I->?": npyfuncs.np_int_isnan_impl, + # int64 + "l->?": npyfuncs.np_int_isnan_impl, + "L->?": npyfuncs.np_int_isnan_impl, + # intp + "q->?": npyfuncs.np_int_isnan_impl, + "Q->?": npyfuncs.np_int_isnan_impl, + # boolean + "?->?": npyfuncs.np_int_isnan_impl, + # datetime & timedelta + "m->?": npyfuncs.np_datetime_isnat_impl, + "M->?": npyfuncs.np_datetime_isnat_impl, + } + + db[np.conjugate] = { + "b->b": numbers.real_conjugate_impl, + "B->B": numbers.real_conjugate_impl, + "h->h": numbers.real_conjugate_impl, + "H->H": numbers.real_conjugate_impl, + "i->i": numbers.real_conjugate_impl, + "I->I": numbers.real_conjugate_impl, + "l->l": numbers.real_conjugate_impl, + "L->L": numbers.real_conjugate_impl, + "q->q": numbers.real_conjugate_impl, + "Q->Q": numbers.real_conjugate_impl, + "f->f": numbers.real_conjugate_impl, + "d->d": numbers.real_conjugate_impl, + "F->F": numbers.complex_conjugate_impl, + "D->D": numbers.complex_conjugate_impl, + } + + if np.divide != np.true_divide: + db[np.divide] = { + "bb->b": npyfuncs.np_int_sdiv_impl, + "BB->B": npyfuncs.np_int_udiv_impl, + "hh->h": npyfuncs.np_int_sdiv_impl, + "HH->H": npyfuncs.np_int_udiv_impl, + "ii->i": npyfuncs.np_int_sdiv_impl, + "II->I": npyfuncs.np_int_udiv_impl, + "ll->l": npyfuncs.np_int_sdiv_impl, + "LL->L": npyfuncs.np_int_udiv_impl, + "qq->q": 
npyfuncs.np_int_sdiv_impl, + "QQ->Q": npyfuncs.np_int_udiv_impl, + "ff->f": npyfuncs.np_real_div_impl, + "dd->d": npyfuncs.np_real_div_impl, + "FF->F": npyfuncs.np_complex_div_impl, + "DD->D": npyfuncs.np_complex_div_impl, + } return db From 8f87a4425f030b2f6f0bba84995c9c873cd3a317 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 Oct 2025 06:07:03 -0700 Subject: [PATCH 08/21] pass --- .../numba/cuda/tests/test_array_reductions.py | 8 +++-- numba_cuda/numba/cuda/ufuncs.py | 33 +++++++++---------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 617f9d753..d484cf5c3 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -135,7 +135,9 @@ def kernel(out): out = cuda.to_device(np.zeros(1, dtype=np.float64)) kernel[1, 1](out) - self.assertPreciseEqual(np.var(arr), out.copy_to_host()[0]) + self.assertPreciseEqual( + np.var(arr), out.copy_to_host()[0], prec="double" + ) arrays = [ np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), @@ -403,7 +405,9 @@ def kernel(out): out = cuda.to_device(np.zeros(1, dtype=np.float64)) kernel[1, 1](out) - self.assertPreciseEqual(np.nanvar(arr), out.copy_to_host()[0]) + self.assertPreciseEqual( + np.nanvar(arr), out.copy_to_host()[0], prec="double" + ) arrays = [ np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index 6133cd25f..a492f9c48 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -726,22 +726,21 @@ def np_real_atanh_impl(context, builder, sig, args): "D->D": numbers.complex_conjugate_impl, } - if np.divide != np.true_divide: - db[np.divide] = { - "bb->b": npyfuncs.np_int_sdiv_impl, - "BB->B": npyfuncs.np_int_udiv_impl, - "hh->h": npyfuncs.np_int_sdiv_impl, - "HH->H": npyfuncs.np_int_udiv_impl, - "ii->i": 
npyfuncs.np_int_sdiv_impl, - "II->I": npyfuncs.np_int_udiv_impl, - "ll->l": npyfuncs.np_int_sdiv_impl, - "LL->L": npyfuncs.np_int_udiv_impl, - "qq->q": npyfuncs.np_int_sdiv_impl, - "QQ->Q": npyfuncs.np_int_udiv_impl, - "ff->f": npyfuncs.np_real_div_impl, - "dd->d": npyfuncs.np_real_div_impl, - "FF->F": npyfuncs.np_complex_div_impl, - "DD->D": npyfuncs.np_complex_div_impl, - } + db[np.divide] = { + "bb->b": npyfuncs.np_int_sdiv_impl, + "BB->B": npyfuncs.np_int_udiv_impl, + "hh->h": npyfuncs.np_int_sdiv_impl, + "HH->H": npyfuncs.np_int_udiv_impl, + "ii->i": npyfuncs.np_int_sdiv_impl, + "II->I": npyfuncs.np_int_udiv_impl, + "ll->l": npyfuncs.np_int_sdiv_impl, + "LL->L": npyfuncs.np_int_udiv_impl, + "qq->q": npyfuncs.np_int_sdiv_impl, + "QQ->Q": npyfuncs.np_int_udiv_impl, + "ff->f": npyfuncs.np_real_div_impl, + "dd->d": npyfuncs.np_real_div_impl, + "FF->F": npyfuncs.np_complex_div_impl, + "DD->D": npyfuncs.np_complex_div_impl, + } return db From 996922348c2f415ef3ee450d1c0e5c4bb1cc1147 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 Oct 2025 06:30:42 -0700 Subject: [PATCH 09/21] faster? 
--- .../numba/cuda/tests/test_array_reductions.py | 54 ++++++++++++------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index d484cf5c3..44dc671a0 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -23,27 +23,41 @@ def tearDown(self): super(TestArrayReductions, self).tearDown() def test_all_basic(self): - def check(arr): - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - out[0] = np.all(arr) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + cases = [ + np.float64([1.0, 0.0, float("inf"), float("nan")]), + np.float64([1.0, -0.0, float("inf"), float("nan")]), + np.float64([1.0, 1.5, float("inf"), float("nan")]), + np.float64([[1.0, 1.5], [float("inf"), float("nan")]]), + np.float64([[1.0, 1.5], [1.5, 1.0]]), + ] - arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) - check(arr) - arr = np.float64([1.0, -0.0, float("inf"), float("nan")]) - check(arr) - arr = np.float64([1.0, 1.5, float("inf"), float("nan")]) - check(arr) - arr = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) - check(arr) - arr = np.float64([[1.0, 1.5], [1.5, 1.0]]) - check(arr) + case_0 = cases[0] + case_1 = cases[1] + case_2 = cases[2] + case_3 = cases[3] + case_4 = cases[4] + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid == 0: + ans = np.all(case_0) + if gid == 1: + ans = np.all(case_1) + if gid == 2: + ans = np.all(case_2) + if gid == 3: + ans = np.all(case_3) + if gid == 4: + ans = np.all(case_4) + out[gid] = ans + + expected = np.array([np.all(a) for a in cases], dtype=np.bool_) + out = cuda.to_device(np.zeros(len(cases), dtype=np.bool_)) + kernel[1, len(cases)](out) + got = out.copy_to_host() + + self.assertPreciseEqual(expected, got) def test_any_basic(self): def 
check(arr): From 398489b420546af394b935b6bed857ecb87ff43e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:13:27 -0500 Subject: [PATCH 10/21] Apply suggestions from code review Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> --- numba_cuda/numba/cuda/cpython/listobj.py | 2 +- numba_cuda/numba/cuda/memory_management/nrt.cu | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cpython/listobj.py b/numba_cuda/numba/cuda/cpython/listobj.py index 08bc954b1..64bd749f7 100644 --- a/numba_cuda/numba/cuda/cpython/listobj.py +++ b/numba_cuda/numba/cuda/cpython/listobj.py @@ -304,7 +304,7 @@ def define_dtor(self): else: dtypestr = str(self.dtype) fn = cgutils.get_or_insert_function( - mod, fnty, "numba_cuda_dtor_list_{}".format(dtypestr) + mod, fnty, f"numba_cuda_dtor_list_{dtypestr}" ) if not fn.is_declaration: # End early if the dtor is already defined diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cu b/numba_cuda/numba/cuda/memory_management/nrt.cu index 25da96847..467e057e3 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cu +++ b/numba_cuda/numba/cuda/memory_management/nrt.cu @@ -91,13 +91,12 @@ nrt_varsize_dtor(void *ptr, size_t size, void *info) { __device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size) { - NRT_MemInfo *mi = NULL; void *data = NRT_Allocate(size); if (data == NULL) { return NULL; /* return early as allocation failed */ } - mi = NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL); + NRT_MemInfo *mi = NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL); return mi; } From b58d7573a9030df1d3a112a8d39bc86904b14d9e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 22 Oct 2025 11:41:16 -0700 Subject: [PATCH 11/21] export --- numba_cuda/numba/cuda/memory_management/nrt.cu | 1 + numba_cuda/numba/cuda/memory_management/nrt.cuh | 1 + 2 files changed, 2 insertions(+) diff --git 
a/numba_cuda/numba/cuda/memory_management/nrt.cu b/numba_cuda/numba/cuda/memory_management/nrt.cu index 467e057e3..8ad9828db 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cu +++ b/numba_cuda/numba/cuda/memory_management/nrt.cu @@ -89,6 +89,7 @@ nrt_varsize_dtor(void *ptr, size_t size, void *info) { NRT_Free(ptr); } +extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size) { void *data = NRT_Allocate(size); diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cuh b/numba_cuda/numba/cuda/memory_management/nrt.cuh index ab7ae5ef7..2dbfa06da 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cuh +++ b/numba_cuda/numba/cuda/memory_management/nrt.cuh @@ -44,4 +44,5 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, size_t size, NRT_dtor_function dtor, void* dtor_info); +extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size); extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor); From a7e24cccd2d908671a8588b6d16fc5e430f81ece Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 23 Oct 2025 06:08:14 -0700 Subject: [PATCH 12/21] basic --- .../numba/cuda/tests/test_array_reductions.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 44dc671a0..07ccafb64 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -7,6 +7,10 @@ from numba.cuda import config +def array_median_global(arr): + return np.median(arr) + + class TestArrayReductions(MemoryLeakMixin, TestCase): """ Test array reduction methods and functions such as .sum(), .max(), etc. 
@@ -436,3 +440,59 @@ def kernel(out): ] for arr in arrays: check(arr) + + def test_median_basic(self): + def variations(a): + # Sorted, reversed, random, many duplicates + yield a + a = a[::-1].copy() + yield a + np.random.shuffle(a) + yield a + a[a % 4 >= 1] = 3.5 + yield a + + self.check_median_basic(array_median_global, variations) + + def check_median_basic(self, pyfunc, array_variations): + # cfunc = jit(nopython=True)(pyfunc) + + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + print(np.median(arr)) + print(arr[2]) + out[0] = np.median(arr) + + expected = pyfunc(arr) + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + + got = out.copy_to_host()[0] + + self.assertPreciseEqual(expected, got) + + # Empty array case + # check(np.array([])) + + # Odd sizes + def check_odd(a): + check(a) + a = a.reshape((9, 7)) + check(a) + check(a.T) + + for a in array_variations(np.arange(63) + 10.5): + check_odd(a) + + # Even sizes + def check_even(a): + check(a) + a = a.reshape((4, 16)) + check(a) + check(a.T) + + for a in array_variations(np.arange(64) + 10.5): + check_even(a) From 14026e81929713eb8a8754489f42d5bdbeab2275 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 23 Oct 2025 06:17:12 -0700 Subject: [PATCH 13/21] add test that exhibits the failure --- .../numba/cuda/tests/test_array_methods.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 numba_cuda/numba/cuda/tests/test_array_methods.py diff --git a/numba_cuda/numba/cuda/tests/test_array_methods.py b/numba_cuda/numba/cuda/tests/test_array_methods.py new file mode 100644 index 000000000..407a44792 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_array_methods.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-2-Clause +import numpy as np + +from numba.tests.support import TestCase, MemoryLeakMixin +from numba import cuda +from numba.cuda import config + + +class TestArrayMethods(MemoryLeakMixin, TestCase): + """ + Test array reduction methods and functions such as .sum(), .max(), etc. + """ + + def setUp(self): + super(TestArrayMethods, self).setUp() + np.random.seed(42) + self.old_nrt_setting = config.CUDA_ENABLE_NRT + config.CUDA_ENABLE_NRT = True + + def tearDown(self): + config.CUDA_ENABLE_NRT = self.old_nrt_setting + super(TestArrayMethods, self).tearDown() + + def test_array_copy(self): + ary = np.array([1.0, 2.0, 3.0]) + out = cuda.to_device(np.zeros(3)) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + cpy = ary.copy() + for i in range(len(out)): + out[i] = cpy[i] + + kernel[1, 1](out) + + result = out.copy_to_host() + np.testing.assert_array_equal(result, ary) From 10f21fd6e4b73bbd89a026008d3c563129897712 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 23 Oct 2025 06:17:51 -0700 Subject: [PATCH 14/21] implement fix --- numba_cuda/numba/cuda/np/arrayobj.py | 52 ++++++++++++---------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/numba_cuda/numba/cuda/np/arrayobj.py b/numba_cuda/numba/cuda/np/arrayobj.py index 3e06d84cd..76703ef50 100644 --- a/numba_cuda/numba/cuda/np/arrayobj.py +++ b/numba_cuda/numba/cuda/np/arrayobj.py @@ -5455,37 +5455,31 @@ def _array_copy(context, builder, sig, args): dest_data = ret.data assert rettype.layout in "CF" - if arytype.layout == rettype.layout: - # Fast path: memcpy - cgutils.raw_memcpy( - builder, dest_data, src_data, ary.nitems, ary.itemsize, align=1 - ) - else: - src_strides = cgutils.unpack_tuple(builder, ary.strides) - dest_strides = cgutils.unpack_tuple(builder, ret.strides) - intp_t = context.get_value_type(types.intp) + src_strides = cgutils.unpack_tuple(builder, ary.strides) + dest_strides = cgutils.unpack_tuple(builder, ret.strides) + 
intp_t = context.get_value_type(types.intp) - with cgutils.loop_nest(builder, shapes, intp_t) as indices: - src_ptr = cgutils.get_item_pointer2( - context, - builder, - src_data, - shapes, - src_strides, - arytype.layout, - indices, - ) - dest_ptr = cgutils.get_item_pointer2( - context, - builder, - dest_data, - shapes, - dest_strides, - rettype.layout, - indices, - ) - builder.store(builder.load(src_ptr), dest_ptr) + with cgutils.loop_nest(builder, shapes, intp_t) as indices: + src_ptr = cgutils.get_item_pointer2( + context, + builder, + src_data, + shapes, + src_strides, + arytype.layout, + indices, + ) + dest_ptr = cgutils.get_item_pointer2( + context, + builder, + dest_data, + shapes, + dest_strides, + rettype.layout, + indices, + ) + builder.store(builder.load(src_ptr), dest_ptr) return impl_ret_new_ref(context, builder, sig.return_type, ret._getvalue()) From 054041fa018854d59bb011d328d31407196bf176 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 27 Oct 2025 04:55:58 -0700 Subject: [PATCH 15/21] updates --- numba_cuda/numba/cuda/tests/test_array_reductions.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 07ccafb64..f83e43f0c 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -460,11 +460,7 @@ def check_median_basic(self, pyfunc, array_variations): def check(arr): @cuda.jit def kernel(out): - gid = cuda.grid(1) - if gid < 1: - print(np.median(arr)) - print(arr[2]) - out[0] = np.median(arr) + out[0] = np.median(arr) expected = pyfunc(arr) out = cuda.to_device(np.zeros(1, dtype=np.float64)) From 0595cb9439f555603a374c13cfa79fb9a34ac574 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Oct 2025 08:13:32 -0700 Subject: [PATCH 16/21] move files, add NRTEnablingCUDATestCase --- numba_cuda/numba/cuda/testing.py | 11 ++++++++++ 
.../cuda/tests/cudapy/test_array_methods.py | 21 +++++++++++++++++-- .../{ => cudapy}/test_array_reductions.py | 5 +++-- 3 files changed, 33 insertions(+), 4 deletions(-) rename numba_cuda/numba/cuda/tests/{ => cudapy}/test_array_reductions.py (98%) diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py index 196d08897..4ed672828 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -184,6 +184,17 @@ def assertFileCheckMatches( ) +class NRTEnablingCUDATestCase(CUDATestCase): + def setUp(self): + self.old_nrt_setting = config.CUDA_ENABLE_NRT + config.CUDA_ENABLE_NRT = True + super().setUp() + + def tearDown(self): + config.CUDA_ENABLE_NRT = self.old_nrt_setting + super().tearDown() + + def skip_on_cudasim(reason): """Skip this test if running on the CUDA simulator""" return unittest.skipIf(config.ENABLE_CUDASIM, reason) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py index 9b5873125..d8ab29d79 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py @@ -3,7 +3,7 @@ import numpy as np from numba import cuda -from numba.cuda.testing import CUDATestCase +from numba.cuda.testing import NRTEnablingCUDATestCase import unittest @@ -13,7 +13,7 @@ def reinterpret_array_type(byte_arr, start, stop, output): output[0] = val -class TestCudaArrayMethods(CUDATestCase): +class TestCudaArrayMethods(NRTEnablingCUDATestCase): def test_reinterpret_array_type(self): """ Reinterpret byte array as int32 in the GPU. 
@@ -33,6 +33,23 @@ def test_reinterpret_array_type(self): got = output[0] self.assertEqual(expect, got) + def test_array_copy(self): + ary = np.array([1.0, 2.0, 3.0]) + out = cuda.to_device(np.zeros(3)) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + cpy = ary.copy() + for i in range(len(out)): + out[i] = cpy[i] + + kernel[1, 1](out) + + result = out.copy_to_host() + np.testing.assert_array_equal(result, ary) + if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py similarity index 98% rename from numba_cuda/numba/cuda/tests/test_array_reductions.py rename to numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 7fc483d98..9d8f02618 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -2,7 +2,8 @@ # SPDX-License-Identifier: BSD-2-Clause import numpy as np -from numba.tests.support import TestCase, MemoryLeakMixin +from numba.tests.support import MemoryLeakMixin +from numba.cuda.testing import NRTEnablingCUDATestCase from numba import cuda @@ -14,7 +15,7 @@ def array_median_global(arr): return np.median(arr) -class TestArrayReductions(MemoryLeakMixin, TestCase): +class TestArrayReductions(MemoryLeakMixin, NRTEnablingCUDATestCase): """ Test array reduction methods and functions such as .sum(), .max(), etc. 
""" From 108df98e0a5b592c0aa3e43eac359a69990b7f3a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Oct 2025 08:58:02 -0700 Subject: [PATCH 17/21] cleanup --- numba_cuda/numba/cuda/target.py | 4 ++-- numba_cuda/numba/cuda/typing/context.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 84f5fcb9b..6faf15571 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -36,7 +36,7 @@ # Typing -class CUDATypingContext(typing.Context): +class CUDATypingContext(typing.BaseContext): def load_additional_registries(self): from . import ( cudadecl, @@ -58,7 +58,6 @@ def load_additional_registries(self): self.install_registry(vector_types.typing_registry) self.install_registry(fp16.typing_registry) self.install_registry(bf16.typing_registry) - super().load_additional_registries() def resolve_value_type(self, val): # treat other dispatcher object as another device function @@ -182,6 +181,7 @@ def load_additional_registries(self): from numba.cuda.core import optional # noqa: F401 from numba.cuda.misc import cffiimpl from numba.cuda.np import arrayobj, npdatetime, polynomial, arraymath + from . 
import ( cudaimpl, fp16, diff --git a/numba_cuda/numba/cuda/typing/context.py b/numba_cuda/numba/cuda/typing/context.py index fbb471004..6c2c933bf 100644 --- a/numba_cuda/numba/cuda/typing/context.py +++ b/numba_cuda/numba/cuda/typing/context.py @@ -491,7 +491,7 @@ def is_external(obj): else: # A type was already inserted, see if we can add to it newty = existing.augment(gty) - if newty is None and existing != gty: + if newty is None: raise TypeError( "cannot augment %s with %s" % (existing, gty) ) From dd1de26056983e9a54e12e8a56d784a033d36e2a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Oct 2025 11:14:17 -0700 Subject: [PATCH 18/21] more tests, buggy copy exposed again --- .../tests/cudapy/test_array_reductions.py | 144 +++++++++++++++++- numba_cuda/numba/cuda/ufuncs.py | 45 ++++++ 2 files changed, 187 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 9d8f02618..3698ff002 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -6,7 +6,7 @@ from numba.cuda.testing import NRTEnablingCUDATestCase from numba import cuda - +from itertools import combinations_with_replacement from numba.cuda.misc.special import literal_unroll from numba.cuda import config @@ -391,7 +391,7 @@ def kernel(out): self.assertPreciseEqual(expected, got) # Empty array case - # check(np.array([])) + check(np.array([])) # Odd sizes def check_odd(a): @@ -412,3 +412,143 @@ def check_even(a): for a in array_variations(np.arange(64) + 10.5): check_even(a) + + def check_percentile(self, pyfunc, q_upper_bound): + def check(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + result = np.percentile(a, q) + for i in range(len(out)): + out[i] = result[i] + + out = cuda.to_device(np.zeros(len(q), dtype=np.float64)) + kernel[1, 1](out) + + expected = np.percentile(a, q) + got = 
out.copy_to_host() + + finite = np.isfinite(expected) + if np.all(finite): + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + else: + self.assertPreciseEqual( + got[finite], expected[finite], abs_tol=abs_tol + ) + + a = self.random.randn(27).reshape(3, 3, 3) + q = np.linspace(0, q_upper_bound, 14)[::-1] + + check(a, q) + check(a, 0) + check(a, q_upper_bound / 2) + check(a, q_upper_bound) + + not_finite = [np.nan, -np.inf, np.inf] + a.flat[:10] = self.random.choice(not_finite, 10) + self.random.shuffle(a) + self.random.shuffle(q) + check(a, q) + + a = a.flatten().tolist() + q = q.flatten().tolist() + check(a, q) + check(tuple(a), tuple(q)) + + a = self.random.choice([1, 2, 3, 4], 10) + q = np.linspace(0, q_upper_bound, 5) + check(a, q) + + # tests inspired by + # https://github.com/numpy/numpy/blob/345b2f6e/numpy/lib/tests/test_function_base.py + x = np.arange(8) * 0.5 + np.testing.assert_equal(np.percentile(x, 0), 0.0) + np.testing.assert_equal(np.percentile(x, q_upper_bound), 3.5) + np.testing.assert_equal(np.percentile(x, q_upper_bound / 2), 1.75) + + x = np.arange(12).reshape(3, 4) + q = np.array((0.25, 0.5, 1.0)) * q_upper_bound + np.testing.assert_equal(np.percentile(x, q), [2.75, 5.5, 11.0]) + + x = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6) + q = np.array((0.25, 0.50)) * q_upper_bound + np.testing.assert_equal(np.percentile(x, q).shape, (2,)) + + q = np.array((0.25, 0.50, 0.75)) * q_upper_bound + np.testing.assert_equal(np.percentile(x, q).shape, (3,)) + + x = np.arange(12).reshape(3, 4) + np.testing.assert_equal(np.percentile(x, q_upper_bound / 2), 5.5) + self.assertTrue(np.isscalar(np.percentile(x, q_upper_bound / 2))) + + np.testing.assert_equal(np.percentile([1, 2, 3], 0), 1) + + a = np.array([2, 3, 4, 1]) + np.percentile(a, [q_upper_bound / 2]) + np.testing.assert_equal(a, np.array([2, 3, 4, 1])) + + def test_percentile_basic(self): + pyfunc = np.percentile + self.check_percentile(pyfunc, q_upper_bound=100) + # 
self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) + # self.check_percentile_exceptions(pyfunc) + + def check_percentile_edge_cases(self, pyfunc, q_upper_bound=100): + def check(a, q, abs_tol=1e-14): + @cuda.jit + def kernel(out): + result = np.percentile(a, q) + for i in range(len(out)): + out[i] = result[i] + + out = cuda.to_device(np.zeros(len(q), dtype=np.float64)) + kernel[1, 1](out) + expected = np.percentile(a, q) + + got = out.copy_to_host() + finite = np.isfinite(expected) + + if np.all(finite): + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + else: + self.assertPreciseEqual( + got[finite], expected[finite], abs_tol=abs_tol + ) + + def convert_to_float_and_check(a, q, abs_tol=1e-14): + expected = pyfunc(a, q).astype(np.float64) + got = np.percentile(a, q) + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + + def _array_combinations(elements): + for i in range(1, 10): + for comb in combinations_with_replacement(elements, i): + yield np.array(comb) + + # high number of combinations, many including non-finite values + q = (0, 0.1 * q_upper_bound, 0.2 * q_upper_bound, q_upper_bound) + element_pool = (1, -1, np.nan, np.inf, -np.inf) + for a in _array_combinations(element_pool): + check(a, q) + + # edge cases - numpy exhibits behavioural differences across + # platforms, see: https://github.com/numpy/numpy/issues/13272 + if q_upper_bound == 1: + _check = convert_to_float_and_check + else: + _check = check + + a = np.array(5) + q = np.array(1) + _check(a, q) + + a = 5 + q = q_upper_bound / 2 + _check(a, q) + + def check_percentile_exceptions(self, pyfunc): + # TODO + pass + + def check_quantile_exceptions(self, pyfunc): + # TODO + pass diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index dc86a455b..9c1728225 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -743,4 +743,49 @@ def np_real_atanh_impl(context, builder, sig, args): "DD->D": npyfuncs.np_complex_div_impl, } + 
db[np.isfinite] = { + "f->?": npyfuncs.np_real_isfinite_impl, + "d->?": npyfuncs.np_real_isfinite_impl, + "F->?": npyfuncs.np_complex_isfinite_impl, + "D->?": npyfuncs.np_complex_isfinite_impl, + # int8 + "b->?": npyfuncs.np_int_isfinite_impl, + "B->?": npyfuncs.np_int_isfinite_impl, + # int16 + "h->?": npyfuncs.np_int_isfinite_impl, + "H->?": npyfuncs.np_int_isfinite_impl, + # int32 + "i->?": npyfuncs.np_int_isfinite_impl, + "I->?": npyfuncs.np_int_isfinite_impl, + # int64 + "l->?": npyfuncs.np_int_isfinite_impl, + "L->?": npyfuncs.np_int_isfinite_impl, + # intp + "q->?": npyfuncs.np_int_isfinite_impl, + "Q->?": npyfuncs.np_int_isfinite_impl, + # boolean + "?->?": npyfuncs.np_int_isfinite_impl, + # datetime & timedelta + "M->?": npyfuncs.np_datetime_isfinite_impl, + "m->?": npyfuncs.np_datetime_isfinite_impl, + } + + db[np.multiply] = { + "??->?": numbers.int_and_impl, + "bb->b": numbers.int_mul_impl, + "BB->B": numbers.int_mul_impl, + "hh->h": numbers.int_mul_impl, + "HH->H": numbers.int_mul_impl, + "ii->i": numbers.int_mul_impl, + "II->I": numbers.int_mul_impl, + "ll->l": numbers.int_mul_impl, + "LL->L": numbers.int_mul_impl, + "qq->q": numbers.int_mul_impl, + "QQ->Q": numbers.int_mul_impl, + "ff->f": numbers.real_mul_impl, + "dd->d": numbers.real_mul_impl, + "FF->F": numbers.complex_mul_impl, + "DD->D": numbers.complex_mul_impl, + } + return db From 5aa441ac63f015fc92a54b29ee3251c3eba108e3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 4 Nov 2025 04:49:20 -0800 Subject: [PATCH 19/21] partial list impl --- numba_cuda/numba/cuda/np/arrayobj.py | 6 ++++ numba_cuda/numba/cuda/target.py | 28 ++++++++++++++- .../tests/cudapy/test_array_reductions.py | 34 +++++++++++++------ 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/numba_cuda/numba/cuda/np/arrayobj.py b/numba_cuda/numba/cuda/np/arrayobj.py index bd059250b..19901fcce 100644 --- a/numba_cuda/numba/cuda/np/arrayobj.py +++ b/numba_cuda/numba/cuda/np/arrayobj.py @@ -3646,6 +3646,12 @@ def 
constant_array(context, builder, ty, pyval): """ return context.make_constant_array(builder, ty, pyval) +@lower_constant(types.List) +def constant_list(context, builder, ty, pyval): + """ + Create a constant list (mechanism is target-dependent). + """ + return context.make_constant_list(builder, ty, pyval) @lower_constant(types.Record) def constant_record(context, builder, ty, pyval): diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6faf15571..1c0ad582a 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -278,12 +278,38 @@ def mangler(self, name, argtypes, *, abi_tags=(), uid=None): name, argtypes, abi_tags=abi_tags, uid=uid ) + def make_constant_list(self, builder, listty, lst): + import numpy as np + constvals = [ + self.get_constant(listty.dtype, i) + for i in iter(np.array(lst)) + ] + instance = self.build_list(builder, listty, constvals) + # create constant address space version of the list + lmod = builder.module + + constlistty = instance.type + constlist = ir.Constant(constlistty, instance) + addrspace = nvvm.ADDRSPACE_CONSTANT + gv = cgutils.add_global_variable( + lmod, constlist.type, "_cudapy_clist", addrspace=addrspace + ) + gv.linkage = "internal" + gv.global_constant = True + gv.initializer = constlist + + # Convert to generic address-space + ptrty = ir.PointerType(constlistty) + genptr = builder.addrspacecast(gv, ptrty, "generic") + lst = cgutils.create_struct_proxy(listty)(self, builder, value=builder.load(genptr)) + return lst._getvalue() + + def make_constant_array(self, builder, aryty, arr): """ Unlike the parent version. This returns a a pointer in the constant addrspace. 
""" - lmod = builder.module constvals = [ diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 3698ff002..3be3f494a 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -414,7 +414,7 @@ def check_even(a): check_even(a) def check_percentile(self, pyfunc, q_upper_bound): - def check(a, q, abs_tol=1e-12): + def check_array_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): result = np.percentile(a, q) @@ -435,28 +435,42 @@ def kernel(out): got[finite], expected[finite], abs_tol=abs_tol ) + def check_scalar_q(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + out[0] = np.percentile(a, q) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + + expected = np.percentile(a, q) + got = out.copy_to_host()[0] + + if np.isfinite(expected): + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + a = self.random.randn(27).reshape(3, 3, 3) - q = np.linspace(0, q_upper_bound, 14)[::-1] + q = np.linspace(0, q_upper_bound, 14)[::-1].copy() - check(a, q) - check(a, 0) - check(a, q_upper_bound / 2) - check(a, q_upper_bound) + check_array_q(a, q) + check_scalar_q(a, 0) + check_scalar_q(a, q_upper_bound / 2) + check_scalar_q(a, q_upper_bound) not_finite = [np.nan, -np.inf, np.inf] a.flat[:10] = self.random.choice(not_finite, 10) self.random.shuffle(a) self.random.shuffle(q) - check(a, q) + check_array_q(a, q) a = a.flatten().tolist() q = q.flatten().tolist() - check(a, q) - check(tuple(a), tuple(q)) + check_array_q(a, q) + #check(tuple(a), tuple(q)) a = self.random.choice([1, 2, 3, 4], 10) q = np.linspace(0, q_upper_bound, 5) - check(a, q) + check_array_q(a, q) # tests inspired by # https://github.com/numpy/numpy/blob/345b2f6e/numpy/lib/tests/test_function_base.py From 5ab95bc6c4689d1cdfd038c2270610f87a08bef8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 4 Nov 2025 
18:45:40 -0800 Subject: [PATCH 20/21] more tests --- .../tests/cudapy/test_array_reductions.py | 101 ++++++++++++++++-- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 3cfe0c5a4..23d4e006f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -10,6 +10,7 @@ from numba.cuda.testing import skip_on_cudasim from numba.cuda.misc.special import literal_unroll from numba.cuda import config +import unittest def array_median_global(arr): @@ -422,14 +423,14 @@ def check_percentile(self, pyfunc, q_upper_bound): def check_array_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): - result = np.percentile(a, q) + result = pyfunc(a, q) for i in range(len(out)): out[i] = result[i] out = cuda.to_device(np.zeros(len(q), dtype=np.float64)) kernel[1, 1](out) - expected = np.percentile(a, q) + expected = pyfunc(a, q) got = out.copy_to_host() finite = np.isfinite(expected) @@ -443,12 +444,12 @@ def kernel(out): def check_scalar_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): - out[0] = np.percentile(a, q) + out[0] = pyfunc(a, q) out = cuda.to_device(np.zeros(1, dtype=np.float64)) kernel[1, 1](out) - expected = np.percentile(a, q) + expected = pyfunc(a, q) got = out.copy_to_host()[0] if np.isfinite(expected): @@ -481,9 +482,13 @@ def kernel(out): def test_percentile_basic(self): pyfunc = np.percentile - # self.check_percentile(pyfunc, q_upper_bound=100) + self.check_percentile(pyfunc, q_upper_bound=100) self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) - # self.check_percentile_exceptions(pyfunc) + + @unittest.expectedFailure + def test_percentile_exceptions(self): + pyfunc = np.percentile + self.check_percentile_exceptions(pyfunc) def check_percentile_edge_cases(self, pyfunc, q_upper_bound=100): # intended to be a faitful reproduction of the 
upstream numba test @@ -541,9 +546,85 @@ def kernel(a_batch, lengths, q_arr, out): ) def check_percentile_exceptions(self, pyfunc): - # TODO - pass + def check_scalar_q_err(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + out[0] = np.percentile(a, q) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertRaises(ValueError) as raises: + kernel[1, 1](out) + self.assertEqual( + "Percentiles must be in the range [0, 100]", + str(raises.exception), + ) + + # Exceptions leak references + self.disable_leak_check() + a = np.arange(5) + check_scalar_q_err(a, -5) # q less than 0 + check_scalar_q_err(a, 105) + check_scalar_q_err(a, np.nan) + + # complex typing failure + @cuda.jit + def kernel(out): + np.percentile(a, q) + + a = np.arange(5) * 1j + q = 0.1 + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertTypingError(): + kernel[1, 1](out) + @unittest.expectedFailure def check_quantile_exceptions(self, pyfunc): - # TODO - pass + def check_scalar_q_err(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + out[0] = np.quantile(a, q) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertRaises(ValueError) as raises: + kernel[1, 1](out) + self.assertEqual( + "Quantiles must be in the range [0, 1]", + str(raises.exception), + ) + + # Exceptions leak references + self.disable_leak_check() + a = np.arange(5) + check_scalar_q_err(a, -0.5) # q less than 0 + check_scalar_q_err(a, 1.05) + check_scalar_q_err(a, np.nan) + + # complex typing failure + @cuda.jit + def kernel(out): + np.quantile(a, q) + + a = np.arange(5) * 1j + q = 0.1 + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertTypingError(): + kernel[1, 1](out) + + def test_quantile_basic(self): + pyfunc = np.quantile + self.check_percentile(pyfunc, q_upper_bound=1) + self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + + def test_nanpercentile_basic(self): + pyfunc = np.nanpercentile + self.check_percentile(pyfunc, 
q_upper_bound=100) + self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) + self.check_percentile_exceptions(pyfunc) + + def test_nanquantile_basic(self): + pyfunc = np.nanquantile + self.check_percentile(pyfunc, q_upper_bound=1) + self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + self.check_quantile_exceptions(pyfunc) From 12ef05804ce7a640ec41f16fef14e3c88897a400 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 12 Nov 2025 12:28:05 -0800 Subject: [PATCH 21/21] renaming --- .../tests/cudapy/test_array_reductions.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 23d4e006f..60bd07ddc 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -419,7 +419,7 @@ def check_even(a): for a in array_variations(np.arange(64) + 10.5): check_even(a) - def check_percentile(self, pyfunc, q_upper_bound): + def check_percentile_and_quantile(self, pyfunc, q_upper_bound): def check_array_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): @@ -482,15 +482,17 @@ def kernel(out): def test_percentile_basic(self): pyfunc = np.percentile - self.check_percentile(pyfunc, q_upper_bound=100) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=100) @unittest.expectedFailure def test_percentile_exceptions(self): pyfunc = np.percentile - self.check_percentile_exceptions(pyfunc) + self.check_percentile_and_quantile_exceptions(pyfunc) - def check_percentile_edge_cases(self, pyfunc, q_upper_bound=100): + def check_percentile_and_quantile_edge_cases( + self, pyfunc, q_upper_bound=100 + ): # intended to be a faitful reproduction of the upstream numba test # packing all the test cases 
into a single kernel for perf def _array_combinations(elements): @@ -545,7 +547,7 @@ def kernel(a_batch, lengths, q_arr, out): got[i][finite], expected[finite], abs_tol=1e-14 ) - def check_percentile_exceptions(self, pyfunc): + def check_percentile_and_quantile_exceptions(self, pyfunc): def check_scalar_q_err(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): @@ -614,17 +616,17 @@ def kernel(out): def test_quantile_basic(self): pyfunc = np.quantile - self.check_percentile(pyfunc, q_upper_bound=1) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=1) def test_nanpercentile_basic(self): pyfunc = np.nanpercentile - self.check_percentile(pyfunc, q_upper_bound=100) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) - self.check_percentile_exceptions(pyfunc) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile_exceptions(pyfunc) def test_nanquantile_basic(self): pyfunc = np.nanquantile - self.check_percentile(pyfunc, q_upper_bound=1) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=1) self.check_quantile_exceptions(pyfunc)