From beeadd1975f41c6b1fcc8fa6ad2a2a00aa92bb03 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 14 Oct 2025 08:26:43 -0700 Subject: [PATCH 01/21] basic test, bits and pieces of nrt needed --- numba_cuda/numba/cuda/cpython/listobj.py | 2 +- .../numba/cuda/memory_management/nrt.cu | 37 ++- .../numba/cuda/memory_management/nrt.cuh | 1 + numba_cuda/numba/cuda/target.py | 18 +- .../numba/cuda/tests/test_array_reductions.py | 297 ++++++++++++++++++ numba_cuda/numba/cuda/typing/context.py | 2 +- 6 files changed, 348 insertions(+), 9 deletions(-) create mode 100644 numba_cuda/numba/cuda/tests/test_array_reductions.py diff --git a/numba_cuda/numba/cuda/cpython/listobj.py b/numba_cuda/numba/cuda/cpython/listobj.py index bd1dbe1f9..7f7834048 100644 --- a/numba_cuda/numba/cuda/cpython/listobj.py +++ b/numba_cuda/numba/cuda/cpython/listobj.py @@ -300,7 +300,7 @@ def define_dtor(self): # Declare dtor fnty = ir.FunctionType(ir.VoidType(), [cgutils.voidptr_t]) fn = cgutils.get_or_insert_function( - mod, fnty, ".dtor.list.{}".format(self.dtype) + mod, fnty, "numba_cuda_dtor_list_{}".format(self.dtype) ) if not fn.is_declaration: # End early if the dtor is already defined diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cu b/numba_cuda/numba/cuda/memory_management/nrt.cu index 729c8d8c4..25da96847 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cu +++ b/numba_cuda/numba/cuda/memory_management/nrt.cu @@ -78,6 +78,39 @@ extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) NRT_MemInfo_destroy(mi); } +static void __device__ +nrt_varsize_dtor(void *ptr, size_t size, void *info) { + if (info) { + /* call element dtor */ + typedef void dtor_fn_t(void *ptr); + dtor_fn_t *dtor = (dtor_fn_t *)info; + dtor(ptr); + } + NRT_Free(ptr); +} + +__device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size) +{ + NRT_MemInfo *mi = NULL; + void *data = NRT_Allocate(size); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + + mi = 
NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL); + + return mi; +} + +extern "C" +__device__ NRT_MemInfo* NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor) { + NRT_MemInfo *mi = NRT_MemInfo_new_varsize(size); + if (mi) { + mi->dtor_info = (void*)dtor; + } + return mi; +} + extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi) { return mi->data; @@ -167,7 +200,7 @@ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi) } -#endif + extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) { @@ -175,3 +208,5 @@ extern "C" __device__ void NRT_incref(NRT_MemInfo* mi) mi->refct++; } } + +#endif diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cuh b/numba_cuda/numba/cuda/memory_management/nrt.cuh index b79dbc7fd..ab7ae5ef7 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cuh +++ b/numba_cuda/numba/cuda/memory_management/nrt.cuh @@ -44,3 +44,4 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, size_t size, NRT_dtor_function dtor, void* dtor_info); +extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor); diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index c2db1a442..819470e51 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -36,7 +36,7 @@ # Typing -class CUDATypingContext(typing.BaseContext): +class CUDATypingContext(typing.Context): def load_additional_registries(self): from . 
import ( cudadecl, @@ -57,6 +57,7 @@ def load_additional_registries(self): self.install_registry(vector_types.typing_registry) self.install_registry(fp16.typing_registry) self.install_registry(bf16.typing_registry) + super().load_additional_registries() def resolve_value_type(self, val): # treat other dispatcher object as another device function @@ -171,11 +172,7 @@ def load_additional_registries(self): from numba.cuda.cpython import builtins as cpython_builtins from numba.cuda.core import optional # noqa: F401 from numba.cuda.misc import cffiimpl - from numba.cuda.np import ( - arrayobj, - npdatetime, - polynomial, - ) + from numba.cuda.np import arrayobj, npdatetime, polynomial, arraymath from . import ( cudaimpl, fp16, @@ -215,6 +212,7 @@ def load_additional_registries(self): self.install_registry(polynomial.registry) self.install_registry(npdatetime.registry) self.install_registry(arrayobj.registry) + self.install_registry(arraymath.registry) def codegen(self): return self._internal_codegen @@ -225,6 +223,14 @@ def target_data(self): self._target_data = ll.create_target_data(nvvm.NVVM().data_layout) return self._target_data + def build_list(self, builder, list_type, items): + """ + Build a list from the Numba *list_type* and its initial *items*. + """ + from numba.cuda.cpython import listobj + + return listobj.build_list(self, builder, list_type, items) + @cached_property def nonconst_module_attrs(self): """ diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py new file mode 100644 index 000000000..41c8b471a --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -0,0 +1,297 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-2-Clause +import numpy as np + +from numba import jit, njit +from numba.tests.support import TestCase, MemoryLeakMixin +from numba import cuda + + +def array_all(arr): + return arr.all() + + +def array_all_global(arr): + return np.all(arr) + + +def array_any(arr): + return arr.any() + + +def array_any_global(arr): + return np.any(arr) + + +def array_cumprod(arr): + return arr.cumprod() + + +def array_cumprod_global(arr): + return np.cumprod(arr) + + +def array_nancumprod(arr): + return np.nancumprod(arr) + + +def array_cumsum(arr): + return arr.cumsum() + + +def array_cumsum_global(arr): + return np.cumsum(arr) + + +def array_nancumsum(arr): + return np.nancumsum(arr) + + +def array_sum(arr): + return arr.sum() + + +def array_sum_global(arr): + return np.sum(arr) + + +def array_prod(arr): + return arr.prod() + + +def array_prod_global(arr): + return np.prod(arr) + + +def array_mean(arr): + return arr.mean() + + +def array_mean_global(arr): + return np.mean(arr) + + +def array_var(arr): + return arr.var() + + +def array_var_global(arr): + return np.var(arr) + + +def array_std(arr): + return arr.std() + + +def array_std_global(arr): + return np.std(arr) + + +def array_min(arr): + return arr.min() + + +def array_min_global(arr): + return np.min(arr) + + +def array_amin(arr): + return np.amin(arr) + + +def array_max(arr): + return arr.max() + + +def array_max_global(arr): + return np.max(arr) + + +def array_amax(arr): + return np.amax(arr) + + +def array_argmin(arr): + return arr.argmin() + + +def array_argmin_global(arr): + return np.argmin(arr) + + +def array_argmax(arr): + return arr.argmax() + + +def array_argmax_global(arr): + return np.argmax(arr) + + +def array_median_global(arr): + return np.median(arr) + + +def array_nanmin(arr): + return np.nanmin(arr) + + +def array_nanmax(arr): + return np.nanmax(arr) + + +def array_nanmean(arr): + return np.nanmean(arr) + + +def array_nansum(arr): + return np.nansum(arr) + + +def 
array_nanprod(arr): + return np.nanprod(arr) + + +def array_nanstd(arr): + return np.nanstd(arr) + + +def array_nanvar(arr): + return np.nanvar(arr) + + +def array_nanmedian_global(arr): + return np.nanmedian(arr) + + +def array_percentile_global(arr, q): + return np.percentile(arr, q) + + +def array_nanpercentile_global(arr, q): + return np.nanpercentile(arr, q) + + +def array_ptp_global(a): + return np.ptp(a) + + +def array_ptp(a): + return a.ptp() + + +def array_quantile_global(arr, q): + return np.quantile(arr, q) + + +def array_nanquantile_global(arr, q): + return np.nanquantile(arr, q) + + +def base_test_arrays(dtype): + if dtype == np.bool_: + + def factory(n): + assert n % 2 == 0 + return np.bool_([0, 1] * (n // 2)) + else: + + def factory(n): + return np.arange(n, dtype=dtype) + 1 + + a1 = factory(10) + a2 = factory(10).reshape(2, 5) + # The prod() of this array fits in a 32-bit int + a3 = (factory(12))[::-1].reshape((2, 3, 2), order="A") + assert not (a3.flags.c_contiguous or a3.flags.f_contiguous) + + return [a1, a2, a3] + + +def full_test_arrays(dtype): + array_list = base_test_arrays(dtype) + + # Add floats with some mantissa + if dtype == np.float32: + array_list += [a / 10 for a in array_list] + + # add imaginary part + if dtype == np.complex64: + acc = [] + for a in array_list: + tmp = a / 10 + 1j * a / 11 + tmp[::2] = np.conj(tmp[::2]) + acc.append(tmp) + array_list.extend(acc) + + for a in array_list: + assert a.dtype == np.dtype(dtype) + return array_list + + +def run_comparative(compare_func, test_array): + cfunc = njit(compare_func) + numpy_result = compare_func(test_array) + numba_result = cfunc(test_array) + + return numpy_result, numba_result + + +class TestArrayReductions(MemoryLeakMixin, TestCase): + """ + Test array reduction methods and functions such as .sum(), .max(), etc. 
+ """ + + def setUp(self): + super(TestArrayReductions, self).setUp() + np.random.seed(42) + + def check_reduction_basic(self, pyfunc, **kwargs): + # Basic reduction checks on 1-d float64 arrays + cfunc = jit(nopython=True)(pyfunc) + + def check(arr): + self.assertPreciseEqual(pyfunc(arr), cfunc(arr), **kwargs) + + arr = np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]) + check(arr) + arr = np.float64([-0.0, -1.5]) + check(arr) + arr = np.float64([-1.5, 2.5, "inf"]) + check(arr) + arr = np.float64([-1.5, 2.5, "-inf"]) + check(arr) + arr = np.float64([-1.5, 2.5, "inf", "-inf"]) + check(arr) + arr = np.float64(["nan", -1.5, 2.5, "nan", 3.0]) + check(arr) + arr = np.float64(["nan", -1.5, 2.5, "nan", "inf", "-inf", 3.0]) + check(arr) + arr = np.float64([5.0, "nan", -1.5, "nan"]) + check(arr) + # Only NaNs + arr = np.float64(["nan", "nan"]) + check(arr) + + def test_all_basic(self): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([1.0, 0.0, float("inf"), float("nan")]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + # arr = np.float64([1.0, 0.0, float('inf'), float('nan')]) + # check(arr) + # arr[1] = -0.0 + # check(arr) + # arr[1] = 1.5 + # check(arr) + # arr = arr.reshape((2, 2)) + # check(arr) + # check(arr[::-1]) diff --git a/numba_cuda/numba/cuda/typing/context.py b/numba_cuda/numba/cuda/typing/context.py index e511de2cb..d278e7df2 100644 --- a/numba_cuda/numba/cuda/typing/context.py +++ b/numba_cuda/numba/cuda/typing/context.py @@ -474,7 +474,7 @@ def is_for_this_target(ftcls): else: # A type was already inserted, see if we can add to it newty = existing.augment(gty) - if newty is None: + if newty is None and existing != gty: raise TypeError( "cannot augment %s with %s" % (existing, gty) ) From 634e976e5c6adb9668d428d28ad390c94c1d1ee9 Mon Sep 17 00:00:00 2001 
From: brandon-b-miller Date: Tue, 14 Oct 2025 13:30:23 -0700 Subject: [PATCH 02/21] some progress --- numba_cuda/numba/cuda/cpython/listobj.py | 6 +- .../numba/cuda/tests/test_array_reductions.py | 60 ++++++++++++++++--- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/numba_cuda/numba/cuda/cpython/listobj.py b/numba_cuda/numba/cuda/cpython/listobj.py index 7f7834048..2a8c661c7 100644 --- a/numba_cuda/numba/cuda/cpython/listobj.py +++ b/numba_cuda/numba/cuda/cpython/listobj.py @@ -299,8 +299,12 @@ def define_dtor(self): mod = builder.module # Declare dtor fnty = ir.FunctionType(ir.VoidType(), [cgutils.voidptr_t]) + if isinstance(self.dtype, types.containers.List): + dtypestr = f"list_{self.dtype.dtype}" + else: + dtypestr = str(self.dtype) fn = cgutils.get_or_insert_function( - mod, fnty, "numba_cuda_dtor_list_{}".format(self.dtype) + mod, fnty, "numba_cuda_dtor_list_{}".format(dtypestr) ) if not fn.is_declaration: # End early if the dtor is already defined diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 41c8b471a..cf829c36d 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -286,12 +286,54 @@ def kernel(out): arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - # arr = np.float64([1.0, 0.0, float('inf'), float('nan')]) - # check(arr) - # arr[1] = -0.0 - # check(arr) - # arr[1] = 1.5 - # check(arr) - # arr = arr.reshape((2, 2)) - # check(arr) - # check(arr[::-1]) + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([1.0, -0.0, float("inf"), float("nan")]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([1.0, -0.0, float("inf"), float("nan")]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + @cuda.jit + def 
kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([1.0, 1.5, float("inf"), float("nan")]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([1.0, 1.5, float("inf"), float("nan")]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + ary = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) + out[0] = np.all(ary) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + + arr = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) From 9e12144be188952d5d51bc0cdc6b5569168c3e7f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 06:04:45 -0700 Subject: [PATCH 03/21] Refactor --- .../numba/cuda/tests/test_array_reductions.py | 330 +----------------- 1 file changed, 15 insertions(+), 315 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index cf829c36d..7bb443357 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -2,240 +2,10 @@ # SPDX-License-Identifier: BSD-2-Clause import numpy as np -from numba import jit, njit from numba.tests.support import TestCase, MemoryLeakMixin from numba import cuda -def array_all(arr): - return arr.all() - - -def array_all_global(arr): - return np.all(arr) - - -def array_any(arr): - return arr.any() - - -def array_any_global(arr): - return np.any(arr) - - -def 
array_cumprod(arr): - return arr.cumprod() - - -def array_cumprod_global(arr): - return np.cumprod(arr) - - -def array_nancumprod(arr): - return np.nancumprod(arr) - - -def array_cumsum(arr): - return arr.cumsum() - - -def array_cumsum_global(arr): - return np.cumsum(arr) - - -def array_nancumsum(arr): - return np.nancumsum(arr) - - -def array_sum(arr): - return arr.sum() - - -def array_sum_global(arr): - return np.sum(arr) - - -def array_prod(arr): - return arr.prod() - - -def array_prod_global(arr): - return np.prod(arr) - - -def array_mean(arr): - return arr.mean() - - -def array_mean_global(arr): - return np.mean(arr) - - -def array_var(arr): - return arr.var() - - -def array_var_global(arr): - return np.var(arr) - - -def array_std(arr): - return arr.std() - - -def array_std_global(arr): - return np.std(arr) - - -def array_min(arr): - return arr.min() - - -def array_min_global(arr): - return np.min(arr) - - -def array_amin(arr): - return np.amin(arr) - - -def array_max(arr): - return arr.max() - - -def array_max_global(arr): - return np.max(arr) - - -def array_amax(arr): - return np.amax(arr) - - -def array_argmin(arr): - return arr.argmin() - - -def array_argmin_global(arr): - return np.argmin(arr) - - -def array_argmax(arr): - return arr.argmax() - - -def array_argmax_global(arr): - return np.argmax(arr) - - -def array_median_global(arr): - return np.median(arr) - - -def array_nanmin(arr): - return np.nanmin(arr) - - -def array_nanmax(arr): - return np.nanmax(arr) - - -def array_nanmean(arr): - return np.nanmean(arr) - - -def array_nansum(arr): - return np.nansum(arr) - - -def array_nanprod(arr): - return np.nanprod(arr) - - -def array_nanstd(arr): - return np.nanstd(arr) - - -def array_nanvar(arr): - return np.nanvar(arr) - - -def array_nanmedian_global(arr): - return np.nanmedian(arr) - - -def array_percentile_global(arr, q): - return np.percentile(arr, q) - - -def array_nanpercentile_global(arr, q): - return np.nanpercentile(arr, q) - - -def 
array_ptp_global(a): - return np.ptp(a) - - -def array_ptp(a): - return a.ptp() - - -def array_quantile_global(arr, q): - return np.quantile(arr, q) - - -def array_nanquantile_global(arr, q): - return np.nanquantile(arr, q) - - -def base_test_arrays(dtype): - if dtype == np.bool_: - - def factory(n): - assert n % 2 == 0 - return np.bool_([0, 1] * (n // 2)) - else: - - def factory(n): - return np.arange(n, dtype=dtype) + 1 - - a1 = factory(10) - a2 = factory(10).reshape(2, 5) - # The prod() of this array fits in a 32-bit int - a3 = (factory(12))[::-1].reshape((2, 3, 2), order="A") - assert not (a3.flags.c_contiguous or a3.flags.f_contiguous) - - return [a1, a2, a3] - - -def full_test_arrays(dtype): - array_list = base_test_arrays(dtype) - - # Add floats with some mantissa - if dtype == np.float32: - array_list += [a / 10 for a in array_list] - - # add imaginary part - if dtype == np.complex64: - acc = [] - for a in array_list: - tmp = a / 10 + 1j * a / 11 - tmp[::2] = np.conj(tmp[::2]) - acc.append(tmp) - array_list.extend(acc) - - for a in array_list: - assert a.dtype == np.dtype(dtype) - return array_list - - -def run_comparative(compare_func, test_array): - cfunc = njit(compare_func) - numpy_result = compare_func(test_array) - numba_result = cfunc(test_array) - - return numpy_result, numba_result - - class TestArrayReductions(MemoryLeakMixin, TestCase): """ Test array reduction methods and functions such as .sum(), .max(), etc. 
@@ -245,95 +15,25 @@ def setUp(self): super(TestArrayReductions, self).setUp() np.random.seed(42) - def check_reduction_basic(self, pyfunc, **kwargs): - # Basic reduction checks on 1-d float64 arrays - cfunc = jit(nopython=True)(pyfunc) - - def check(arr): - self.assertPreciseEqual(pyfunc(arr), cfunc(arr), **kwargs) - - arr = np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]) - check(arr) - arr = np.float64([-0.0, -1.5]) - check(arr) - arr = np.float64([-1.5, 2.5, "inf"]) - check(arr) - arr = np.float64([-1.5, 2.5, "-inf"]) - check(arr) - arr = np.float64([-1.5, 2.5, "inf", "-inf"]) - check(arr) - arr = np.float64(["nan", -1.5, 2.5, "nan", 3.0]) - check(arr) - arr = np.float64(["nan", -1.5, 2.5, "nan", "inf", "-inf", 3.0]) - check(arr) - arr = np.float64([5.0, "nan", -1.5, "nan"]) - check(arr) - # Only NaNs - arr = np.float64(["nan", "nan"]) - check(arr) - def test_all_basic(self): - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([1.0, 0.0, float("inf"), float("nan")]) - out[0] = np.all(ary) + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.all(arr) - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([1.0, -0.0, float("inf"), float("nan")]) - out[0] = np.all(ary) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - + check(arr) arr = np.float64([1.0, -0.0, float("inf"), float("nan")]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([1.0, 1.5, float("inf"), float("nan")]) - out[0] = np.all(ary) - - out = 
cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - + check(arr) arr = np.float64([1.0, 1.5, float("inf"), float("nan")]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) - out[0] = np.all(ary) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - + check(arr) arr = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) - - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - ary = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) - out[0] = np.all(ary) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - - arr = np.float64([[float("inf"), float("nan")], [1.5, 1.0]]) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + check(arr) + arr = np.float64([[1.0, 1.5], [1.5, 1.0]]) + check(arr) From c4d4abe45b32545c60770a2f8c9f54fb54206dba Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 06:14:38 -0700 Subject: [PATCH 04/21] more reductions --- .../numba/cuda/tests/test_array_reductions.py | 374 ++++++++++++++++++ 1 file changed, 374 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 7bb443357..29df509f3 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -37,3 +37,377 @@ def kernel(out): check(arr) arr = np.float64([[1.0, 1.5], [1.5, 1.0]]) check(arr) + + def test_any_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.any(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.bool_)) + kernel[1, 1](out) + self.assertPreciseEqual(np.any(arr), out.copy_to_host()[0]) + + arr = np.float64([0.0, -0.0, 0.0, 0.0]) + check(arr) + arr[2] = float("nan") 
+ check(arr) + arr[2] = float("inf") + check(arr) + arr[2] = 1.5 + check(arr) + arr = arr.reshape((2, 2)) + check(arr) + check(arr[::-1]) + + def test_sum_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.sum(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.sum(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_mean_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.mean(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.mean(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_var_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.var(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.var(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, 
-1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_std_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.std(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.std(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_min_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.min(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.min(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_max_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 
1: + out[0] = np.max(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.max(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanmin_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanmin(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanmin(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanmax_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanmax(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanmax(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + 
np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanmean_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanmean(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanmean(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64( + [np.nan, -1.5, 2.5, np.nan, float("inf"), -float("inf"), 3.0] + ), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nansum_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nansum(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nansum(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanprod_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanprod(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanprod(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + 
np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanstd_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanstd(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanstd(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) + + def test_nanvar_basic(self): + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + out[0] = np.nanvar(arr) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + self.assertPreciseEqual(np.nanvar(arr), out.copy_to_host()[0]) + + arrays = [ + np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), + np.float64([-0.0, -1.5]), + np.float64([-1.5, 2.5, np.nan]), + np.float64([-1.5, 2.5, float("inf")]), + np.float64([-1.5, 2.5, -float("inf")]), + np.float64([-1.5, 2.5, float("inf"), -float("inf")]), + np.float64([np.nan, -1.5, 2.5, np.nan, 3.0]), + np.float64([5.0, np.nan, -1.5, np.nan]), + np.float64([np.nan, np.nan]), + ] + for arr in arrays: + check(arr) From 06226f20cd6e1528e6b8dd8c3a74ab082a5c734f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 07:52:55 -0700 Subject: [PATCH 05/21] fix ufuncs --- numba_cuda/numba/cuda/ufuncs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index f305d3ee4..a58dc9bce 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ 
b/numba_cuda/numba/cuda/ufuncs.py @@ -17,6 +17,7 @@ get_unary_impl_for_fn_and_ty, get_binary_impl_for_fn_and_ty, ) +from numba.cuda.np.ufunc_db import _fill_ufunc_db def get_ufunc_info(ufunc_key): @@ -682,4 +683,6 @@ def np_real_atanh_impl(context, builder, sig, args): "D->D": npyfuncs.np_complex_log10_impl, } + _fill_ufunc_db(db) + return db From 661180525caf2e01590b534e62f29103d448de5c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 10:59:41 -0700 Subject: [PATCH 06/21] enable nrt --- numba_cuda/numba/cuda/tests/test_array_reductions.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 29df509f3..617f9d753 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -4,6 +4,7 @@ from numba.tests.support import TestCase, MemoryLeakMixin from numba import cuda +from numba.cuda import config class TestArrayReductions(MemoryLeakMixin, TestCase): @@ -14,6 +15,12 @@ class TestArrayReductions(MemoryLeakMixin, TestCase): def setUp(self): super(TestArrayReductions, self).setUp() np.random.seed(42) + self.old_nrt_setting = config.CUDA_ENABLE_NRT + config.CUDA_ENABLE_NRT = True + + def tearDown(self): + config.CUDA_ENABLE_NRT = self.old_nrt_setting + super(TestArrayReductions, self).tearDown() def test_all_basic(self): def check(arr): From 7bc54a0954093247f444f1808dae9eec396e1412 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 15:24:40 -0700 Subject: [PATCH 07/21] fixes --- numba_cuda/numba/cuda/ufuncs.py | 63 +++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index 0a3f8c43b..6133cd25f 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -17,7 +17,6 @@ get_unary_impl_for_fn_and_ty, 
get_binary_impl_for_fn_and_ty, ) -from numba.cuda.np.ufunc_db import _fill_ufunc_db def get_ufunc_info(ufunc_key): @@ -683,6 +682,66 @@ def np_real_atanh_impl(context, builder, sig, args): "D->D": npyfuncs.np_complex_log10_impl, } - _fill_ufunc_db(db) + db[np.isnan] = { + "f->?": npyfuncs.np_real_isnan_impl, + "d->?": npyfuncs.np_real_isnan_impl, + "F->?": npyfuncs.np_complex_isnan_impl, + "D->?": npyfuncs.np_complex_isnan_impl, + # int8 + "b->?": npyfuncs.np_int_isnan_impl, + "B->?": npyfuncs.np_int_isnan_impl, + # int16 + "h->?": npyfuncs.np_int_isnan_impl, + "H->?": npyfuncs.np_int_isnan_impl, + # int32 + "i->?": npyfuncs.np_int_isnan_impl, + "I->?": npyfuncs.np_int_isnan_impl, + # int64 + "l->?": npyfuncs.np_int_isnan_impl, + "L->?": npyfuncs.np_int_isnan_impl, + # intp + "q->?": npyfuncs.np_int_isnan_impl, + "Q->?": npyfuncs.np_int_isnan_impl, + # boolean + "?->?": npyfuncs.np_int_isnan_impl, + # datetime & timedelta + "m->?": npyfuncs.np_datetime_isnat_impl, + "M->?": npyfuncs.np_datetime_isnat_impl, + } + + db[np.conjugate] = { + "b->b": numbers.real_conjugate_impl, + "B->B": numbers.real_conjugate_impl, + "h->h": numbers.real_conjugate_impl, + "H->H": numbers.real_conjugate_impl, + "i->i": numbers.real_conjugate_impl, + "I->I": numbers.real_conjugate_impl, + "l->l": numbers.real_conjugate_impl, + "L->L": numbers.real_conjugate_impl, + "q->q": numbers.real_conjugate_impl, + "Q->Q": numbers.real_conjugate_impl, + "f->f": numbers.real_conjugate_impl, + "d->d": numbers.real_conjugate_impl, + "F->F": numbers.complex_conjugate_impl, + "D->D": numbers.complex_conjugate_impl, + } + + if np.divide != np.true_divide: + db[np.divide] = { + "bb->b": npyfuncs.np_int_sdiv_impl, + "BB->B": npyfuncs.np_int_udiv_impl, + "hh->h": npyfuncs.np_int_sdiv_impl, + "HH->H": npyfuncs.np_int_udiv_impl, + "ii->i": npyfuncs.np_int_sdiv_impl, + "II->I": npyfuncs.np_int_udiv_impl, + "ll->l": npyfuncs.np_int_sdiv_impl, + "LL->L": npyfuncs.np_int_udiv_impl, + "qq->q": 
npyfuncs.np_int_sdiv_impl, + "QQ->Q": npyfuncs.np_int_udiv_impl, + "ff->f": npyfuncs.np_real_div_impl, + "dd->d": npyfuncs.np_real_div_impl, + "FF->F": npyfuncs.np_complex_div_impl, + "DD->D": npyfuncs.np_complex_div_impl, + } return db From 8f87a4425f030b2f6f0bba84995c9c873cd3a317 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 Oct 2025 06:07:03 -0700 Subject: [PATCH 08/21] pass --- .../numba/cuda/tests/test_array_reductions.py | 8 +++-- numba_cuda/numba/cuda/ufuncs.py | 33 +++++++++---------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 617f9d753..d484cf5c3 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -135,7 +135,9 @@ def kernel(out): out = cuda.to_device(np.zeros(1, dtype=np.float64)) kernel[1, 1](out) - self.assertPreciseEqual(np.var(arr), out.copy_to_host()[0]) + self.assertPreciseEqual( + np.var(arr), out.copy_to_host()[0], prec="double" + ) arrays = [ np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), @@ -403,7 +405,9 @@ def kernel(out): out = cuda.to_device(np.zeros(1, dtype=np.float64)) kernel[1, 1](out) - self.assertPreciseEqual(np.nanvar(arr), out.copy_to_host()[0]) + self.assertPreciseEqual( + np.nanvar(arr), out.copy_to_host()[0], prec="double" + ) arrays = [ np.float64([1.0, 2.0, 0.0, -0.0, 1.0, -1.5]), diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index 6133cd25f..a492f9c48 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -726,22 +726,21 @@ def np_real_atanh_impl(context, builder, sig, args): "D->D": numbers.complex_conjugate_impl, } - if np.divide != np.true_divide: - db[np.divide] = { - "bb->b": npyfuncs.np_int_sdiv_impl, - "BB->B": npyfuncs.np_int_udiv_impl, - "hh->h": npyfuncs.np_int_sdiv_impl, - "HH->H": npyfuncs.np_int_udiv_impl, - "ii->i": 
npyfuncs.np_int_sdiv_impl, - "II->I": npyfuncs.np_int_udiv_impl, - "ll->l": npyfuncs.np_int_sdiv_impl, - "LL->L": npyfuncs.np_int_udiv_impl, - "qq->q": npyfuncs.np_int_sdiv_impl, - "QQ->Q": npyfuncs.np_int_udiv_impl, - "ff->f": npyfuncs.np_real_div_impl, - "dd->d": npyfuncs.np_real_div_impl, - "FF->F": npyfuncs.np_complex_div_impl, - "DD->D": npyfuncs.np_complex_div_impl, - } + db[np.divide] = { + "bb->b": npyfuncs.np_int_sdiv_impl, + "BB->B": npyfuncs.np_int_udiv_impl, + "hh->h": npyfuncs.np_int_sdiv_impl, + "HH->H": npyfuncs.np_int_udiv_impl, + "ii->i": npyfuncs.np_int_sdiv_impl, + "II->I": npyfuncs.np_int_udiv_impl, + "ll->l": npyfuncs.np_int_sdiv_impl, + "LL->L": npyfuncs.np_int_udiv_impl, + "qq->q": npyfuncs.np_int_sdiv_impl, + "QQ->Q": npyfuncs.np_int_udiv_impl, + "ff->f": npyfuncs.np_real_div_impl, + "dd->d": npyfuncs.np_real_div_impl, + "FF->F": npyfuncs.np_complex_div_impl, + "DD->D": npyfuncs.np_complex_div_impl, + } return db From 996922348c2f415ef3ee450d1c0e5c4bb1cc1147 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 16 Oct 2025 06:30:42 -0700 Subject: [PATCH 09/21] faster? 
--- .../numba/cuda/tests/test_array_reductions.py | 54 ++++++++++++------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index d484cf5c3..44dc671a0 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -23,27 +23,41 @@ def tearDown(self): super(TestArrayReductions, self).tearDown() def test_all_basic(self): - def check(arr): - @cuda.jit - def kernel(out): - gid = cuda.grid(1) - if gid < 1: - out[0] = np.all(arr) - - out = cuda.to_device(np.zeros(1, dtype=np.bool_)) - kernel[1, 1](out) - self.assertPreciseEqual(np.all(arr), out.copy_to_host()[0]) + cases = [ + np.float64([1.0, 0.0, float("inf"), float("nan")]), + np.float64([1.0, -0.0, float("inf"), float("nan")]), + np.float64([1.0, 1.5, float("inf"), float("nan")]), + np.float64([[1.0, 1.5], [float("inf"), float("nan")]]), + np.float64([[1.0, 1.5], [1.5, 1.0]]), + ] - arr = np.float64([1.0, 0.0, float("inf"), float("nan")]) - check(arr) - arr = np.float64([1.0, -0.0, float("inf"), float("nan")]) - check(arr) - arr = np.float64([1.0, 1.5, float("inf"), float("nan")]) - check(arr) - arr = np.float64([[1.0, 1.5], [float("inf"), float("nan")]]) - check(arr) - arr = np.float64([[1.0, 1.5], [1.5, 1.0]]) - check(arr) + case_0 = cases[0] + case_1 = cases[1] + case_2 = cases[2] + case_3 = cases[3] + case_4 = cases[4] + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid == 0: + ans = np.all(case_0) + if gid == 1: + ans = np.all(case_1) + if gid == 2: + ans = np.all(case_2) + if gid == 3: + ans = np.all(case_3) + if gid == 4: + ans = np.all(case_4) + out[gid] = ans + + expected = np.array([np.all(a) for a in cases], dtype=np.bool_) + out = cuda.to_device(np.zeros(len(cases), dtype=np.bool_)) + kernel[1, len(cases)](out) + got = out.copy_to_host() + + self.assertPreciseEqual(expected, got) def test_any_basic(self): def 
check(arr): From 398489b420546af394b935b6bed857ecb87ff43e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:13:27 -0500 Subject: [PATCH 10/21] Apply suggestions from code review Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> --- numba_cuda/numba/cuda/cpython/listobj.py | 2 +- numba_cuda/numba/cuda/memory_management/nrt.cu | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cpython/listobj.py b/numba_cuda/numba/cuda/cpython/listobj.py index 08bc954b1..64bd749f7 100644 --- a/numba_cuda/numba/cuda/cpython/listobj.py +++ b/numba_cuda/numba/cuda/cpython/listobj.py @@ -304,7 +304,7 @@ def define_dtor(self): else: dtypestr = str(self.dtype) fn = cgutils.get_or_insert_function( - mod, fnty, "numba_cuda_dtor_list_{}".format(dtypestr) + mod, fnty, f"numba_cuda_dtor_list_{dtypestr}" ) if not fn.is_declaration: # End early if the dtor is already defined diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cu b/numba_cuda/numba/cuda/memory_management/nrt.cu index 25da96847..467e057e3 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cu +++ b/numba_cuda/numba/cuda/memory_management/nrt.cu @@ -91,13 +91,12 @@ nrt_varsize_dtor(void *ptr, size_t size, void *info) { __device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size) { - NRT_MemInfo *mi = NULL; void *data = NRT_Allocate(size); if (data == NULL) { return NULL; /* return early as allocation failed */ } - mi = NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL); + NRT_MemInfo *mi = NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL); return mi; } From b58d7573a9030df1d3a112a8d39bc86904b14d9e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 22 Oct 2025 11:41:16 -0700 Subject: [PATCH 11/21] export --- numba_cuda/numba/cuda/memory_management/nrt.cu | 1 + numba_cuda/numba/cuda/memory_management/nrt.cuh | 1 + 2 files changed, 2 insertions(+) diff --git 
a/numba_cuda/numba/cuda/memory_management/nrt.cu b/numba_cuda/numba/cuda/memory_management/nrt.cu index 467e057e3..8ad9828db 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cu +++ b/numba_cuda/numba/cuda/memory_management/nrt.cu @@ -89,6 +89,7 @@ nrt_varsize_dtor(void *ptr, size_t size, void *info) { NRT_Free(ptr); } +extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size) { void *data = NRT_Allocate(size); diff --git a/numba_cuda/numba/cuda/memory_management/nrt.cuh b/numba_cuda/numba/cuda/memory_management/nrt.cuh index ab7ae5ef7..2dbfa06da 100644 --- a/numba_cuda/numba/cuda/memory_management/nrt.cuh +++ b/numba_cuda/numba/cuda/memory_management/nrt.cuh @@ -44,4 +44,5 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, size_t size, NRT_dtor_function dtor, void* dtor_info); +extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize(size_t size); extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor); From a7e24cccd2d908671a8588b6d16fc5e430f81ece Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 23 Oct 2025 06:08:14 -0700 Subject: [PATCH 12/21] basic --- .../numba/cuda/tests/test_array_reductions.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 44dc671a0..07ccafb64 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -7,6 +7,10 @@ from numba.cuda import config +def array_median_global(arr): + return np.median(arr) + + class TestArrayReductions(MemoryLeakMixin, TestCase): """ Test array reduction methods and functions such as .sum(), .max(), etc. 
@@ -436,3 +440,59 @@ def kernel(out): ] for arr in arrays: check(arr) + + def test_median_basic(self): + def variations(a): + # Sorted, reversed, random, many duplicates + yield a + a = a[::-1].copy() + yield a + np.random.shuffle(a) + yield a + a[a % 4 >= 1] = 3.5 + yield a + + self.check_median_basic(array_median_global, variations) + + def check_median_basic(self, pyfunc, array_variations): + # cfunc = jit(nopython=True)(pyfunc) + + def check(arr): + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + print(np.median(arr)) + print(arr[2]) + out[0] = np.median(arr) + + expected = pyfunc(arr) + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + + got = out.copy_to_host()[0] + + self.assertPreciseEqual(expected, got) + + # Empty array case + # check(np.array([])) + + # Odd sizes + def check_odd(a): + check(a) + a = a.reshape((9, 7)) + check(a) + check(a.T) + + for a in array_variations(np.arange(63) + 10.5): + check_odd(a) + + # Even sizes + def check_even(a): + check(a) + a = a.reshape((4, 16)) + check(a) + check(a.T) + + for a in array_variations(np.arange(64) + 10.5): + check_even(a) From 14026e81929713eb8a8754489f42d5bdbeab2275 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 23 Oct 2025 06:17:12 -0700 Subject: [PATCH 13/21] add test that exhibits the failure --- .../numba/cuda/tests/test_array_methods.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 numba_cuda/numba/cuda/tests/test_array_methods.py diff --git a/numba_cuda/numba/cuda/tests/test_array_methods.py b/numba_cuda/numba/cuda/tests/test_array_methods.py new file mode 100644 index 000000000..407a44792 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_array_methods.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-2-Clause +import numpy as np + +from numba.tests.support import TestCase, MemoryLeakMixin +from numba import cuda +from numba.cuda import config + + +class TestArrayMethods(MemoryLeakMixin, TestCase): + """ + Test array reduction methods and functions such as .sum(), .max(), etc. + """ + + def setUp(self): + super(TestArrayMethods, self).setUp() + np.random.seed(42) + self.old_nrt_setting = config.CUDA_ENABLE_NRT + config.CUDA_ENABLE_NRT = True + + def tearDown(self): + config.CUDA_ENABLE_NRT = self.old_nrt_setting + super(TestArrayMethods, self).tearDown() + + def test_array_copy(self): + ary = np.array([1.0, 2.0, 3.0]) + out = cuda.to_device(np.zeros(3)) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + cpy = ary.copy() + for i in range(len(out)): + out[i] = cpy[i] + + kernel[1, 1](out) + + result = out.copy_to_host() + np.testing.assert_array_equal(result, ary) From 10f21fd6e4b73bbd89a026008d3c563129897712 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 23 Oct 2025 06:17:51 -0700 Subject: [PATCH 14/21] implement fix --- numba_cuda/numba/cuda/np/arrayobj.py | 52 ++++++++++++---------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/numba_cuda/numba/cuda/np/arrayobj.py b/numba_cuda/numba/cuda/np/arrayobj.py index 3e06d84cd..76703ef50 100644 --- a/numba_cuda/numba/cuda/np/arrayobj.py +++ b/numba_cuda/numba/cuda/np/arrayobj.py @@ -5455,37 +5455,31 @@ def _array_copy(context, builder, sig, args): dest_data = ret.data assert rettype.layout in "CF" - if arytype.layout == rettype.layout: - # Fast path: memcpy - cgutils.raw_memcpy( - builder, dest_data, src_data, ary.nitems, ary.itemsize, align=1 - ) - else: - src_strides = cgutils.unpack_tuple(builder, ary.strides) - dest_strides = cgutils.unpack_tuple(builder, ret.strides) - intp_t = context.get_value_type(types.intp) + src_strides = cgutils.unpack_tuple(builder, ary.strides) + dest_strides = cgutils.unpack_tuple(builder, ret.strides) + 
intp_t = context.get_value_type(types.intp) - with cgutils.loop_nest(builder, shapes, intp_t) as indices: - src_ptr = cgutils.get_item_pointer2( - context, - builder, - src_data, - shapes, - src_strides, - arytype.layout, - indices, - ) - dest_ptr = cgutils.get_item_pointer2( - context, - builder, - dest_data, - shapes, - dest_strides, - rettype.layout, - indices, - ) - builder.store(builder.load(src_ptr), dest_ptr) + with cgutils.loop_nest(builder, shapes, intp_t) as indices: + src_ptr = cgutils.get_item_pointer2( + context, + builder, + src_data, + shapes, + src_strides, + arytype.layout, + indices, + ) + dest_ptr = cgutils.get_item_pointer2( + context, + builder, + dest_data, + shapes, + dest_strides, + rettype.layout, + indices, + ) + builder.store(builder.load(src_ptr), dest_ptr) return impl_ret_new_ref(context, builder, sig.return_type, ret._getvalue()) From 054041fa018854d59bb011d328d31407196bf176 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 27 Oct 2025 04:55:58 -0700 Subject: [PATCH 15/21] updates --- numba_cuda/numba/cuda/tests/test_array_reductions.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/test_array_reductions.py index 07ccafb64..f83e43f0c 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/test_array_reductions.py @@ -460,11 +460,7 @@ def check_median_basic(self, pyfunc, array_variations): def check(arr): @cuda.jit def kernel(out): - gid = cuda.grid(1) - if gid < 1: - print(np.median(arr)) - print(arr[2]) - out[0] = np.median(arr) + out[0] = np.median(arr) expected = pyfunc(arr) out = cuda.to_device(np.zeros(1, dtype=np.float64)) From 0595cb9439f555603a374c13cfa79fb9a34ac574 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Oct 2025 08:13:32 -0700 Subject: [PATCH 16/21] move files, add NRTEnablingCUDATestCase --- numba_cuda/numba/cuda/testing.py | 11 ++++++++++ 
.../cuda/tests/cudapy/test_array_methods.py | 21 +++++++++++++++++-- .../{ => cudapy}/test_array_reductions.py | 5 +++-- 3 files changed, 33 insertions(+), 4 deletions(-) rename numba_cuda/numba/cuda/tests/{ => cudapy}/test_array_reductions.py (98%) diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py index 196d08897..4ed672828 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -184,6 +184,17 @@ def assertFileCheckMatches( ) +class NRTEnablingCUDATestCase(CUDATestCase): + def setUp(self): + self.old_nrt_setting = config.CUDA_ENABLE_NRT + config.CUDA_ENABLE_NRT = True + super().setUp() + + def tearDown(self): + config.CUDA_ENABLE_NRT = self.old_nrt_setting + super().tearDown() + + def skip_on_cudasim(reason): """Skip this test if running on the CUDA simulator""" return unittest.skipIf(config.ENABLE_CUDASIM, reason) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py index 9b5873125..d8ab29d79 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py @@ -3,7 +3,7 @@ import numpy as np from numba import cuda -from numba.cuda.testing import CUDATestCase +from numba.cuda.testing import NRTEnablingCUDATestCase import unittest @@ -13,7 +13,7 @@ def reinterpret_array_type(byte_arr, start, stop, output): output[0] = val -class TestCudaArrayMethods(CUDATestCase): +class TestCudaArrayMethods(NRTEnablingCUDATestCase): def test_reinterpret_array_type(self): """ Reinterpret byte array as int32 in the GPU. 
@@ -33,6 +33,23 @@ def test_reinterpret_array_type(self): got = output[0] self.assertEqual(expect, got) + def test_array_copy(self): + ary = np.array([1.0, 2.0, 3.0]) + out = cuda.to_device(np.zeros(3)) + + @cuda.jit + def kernel(out): + gid = cuda.grid(1) + if gid < 1: + cpy = ary.copy() + for i in range(len(out)): + out[i] = cpy[i] + + kernel[1, 1](out) + + result = out.copy_to_host() + np.testing.assert_array_equal(result, ary) + if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py similarity index 98% rename from numba_cuda/numba/cuda/tests/test_array_reductions.py rename to numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 7fc483d98..9d8f02618 100644 --- a/numba_cuda/numba/cuda/tests/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -2,7 +2,8 @@ # SPDX-License-Identifier: BSD-2-Clause import numpy as np -from numba.tests.support import TestCase, MemoryLeakMixin +from numba.tests.support import MemoryLeakMixin +from numba.cuda.testing import NRTEnablingCUDATestCase from numba import cuda @@ -14,7 +15,7 @@ def array_median_global(arr): return np.median(arr) -class TestArrayReductions(MemoryLeakMixin, TestCase): +class TestArrayReductions(MemoryLeakMixin, NRTEnablingCUDATestCase): """ Test array reduction methods and functions such as .sum(), .max(), etc. 
""" From 108df98e0a5b592c0aa3e43eac359a69990b7f3a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Oct 2025 08:58:02 -0700 Subject: [PATCH 17/21] cleanup --- numba_cuda/numba/cuda/target.py | 4 ++-- numba_cuda/numba/cuda/typing/context.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 84f5fcb9b..6faf15571 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -36,7 +36,7 @@ # Typing -class CUDATypingContext(typing.Context): +class CUDATypingContext(typing.BaseContext): def load_additional_registries(self): from . import ( cudadecl, @@ -58,7 +58,6 @@ def load_additional_registries(self): self.install_registry(vector_types.typing_registry) self.install_registry(fp16.typing_registry) self.install_registry(bf16.typing_registry) - super().load_additional_registries() def resolve_value_type(self, val): # treat other dispatcher object as another device function @@ -182,6 +181,7 @@ def load_additional_registries(self): from numba.cuda.core import optional # noqa: F401 from numba.cuda.misc import cffiimpl from numba.cuda.np import arrayobj, npdatetime, polynomial, arraymath + from . 
import ( cudaimpl, fp16, diff --git a/numba_cuda/numba/cuda/typing/context.py b/numba_cuda/numba/cuda/typing/context.py index fbb471004..6c2c933bf 100644 --- a/numba_cuda/numba/cuda/typing/context.py +++ b/numba_cuda/numba/cuda/typing/context.py @@ -491,7 +491,7 @@ def is_external(obj): else: # A type was already inserted, see if we can add to it newty = existing.augment(gty) - if newty is None and existing != gty: + if newty is None: raise TypeError( "cannot augment %s with %s" % (existing, gty) ) From dd1de26056983e9a54e12e8a56d784a033d36e2a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Oct 2025 11:14:17 -0700 Subject: [PATCH 18/21] more tests, buggy copy exposed again --- .../tests/cudapy/test_array_reductions.py | 144 +++++++++++++++++- numba_cuda/numba/cuda/ufuncs.py | 45 ++++++ 2 files changed, 187 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 9d8f02618..3698ff002 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -6,7 +6,7 @@ from numba.cuda.testing import NRTEnablingCUDATestCase from numba import cuda - +from itertools import combinations_with_replacement from numba.cuda.misc.special import literal_unroll from numba.cuda import config @@ -391,7 +391,7 @@ def kernel(out): self.assertPreciseEqual(expected, got) # Empty array case - # check(np.array([])) + check(np.array([])) # Odd sizes def check_odd(a): @@ -412,3 +412,143 @@ def check_even(a): for a in array_variations(np.arange(64) + 10.5): check_even(a) + + def check_percentile(self, pyfunc, q_upper_bound): + def check(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + result = np.percentile(a, q) + for i in range(len(out)): + out[i] = result[i] + + out = cuda.to_device(np.zeros(len(q), dtype=np.float64)) + kernel[1, 1](out) + + expected = np.percentile(a, q) + got = 
out.copy_to_host() + + finite = np.isfinite(expected) + if np.all(finite): + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + else: + self.assertPreciseEqual( + got[finite], expected[finite], abs_tol=abs_tol + ) + + a = self.random.randn(27).reshape(3, 3, 3) + q = np.linspace(0, q_upper_bound, 14)[::-1] + + check(a, q) + check(a, 0) + check(a, q_upper_bound / 2) + check(a, q_upper_bound) + + not_finite = [np.nan, -np.inf, np.inf] + a.flat[:10] = self.random.choice(not_finite, 10) + self.random.shuffle(a) + self.random.shuffle(q) + check(a, q) + + a = a.flatten().tolist() + q = q.flatten().tolist() + check(a, q) + check(tuple(a), tuple(q)) + + a = self.random.choice([1, 2, 3, 4], 10) + q = np.linspace(0, q_upper_bound, 5) + check(a, q) + + # tests inspired by + # https://github.com/numpy/numpy/blob/345b2f6e/numpy/lib/tests/test_function_base.py + x = np.arange(8) * 0.5 + np.testing.assert_equal(np.percentile(x, 0), 0.0) + np.testing.assert_equal(np.percentile(x, q_upper_bound), 3.5) + np.testing.assert_equal(np.percentile(x, q_upper_bound / 2), 1.75) + + x = np.arange(12).reshape(3, 4) + q = np.array((0.25, 0.5, 1.0)) * q_upper_bound + np.testing.assert_equal(np.percentile(x, q), [2.75, 5.5, 11.0]) + + x = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6) + q = np.array((0.25, 0.50)) * q_upper_bound + np.testing.assert_equal(np.percentile(x, q).shape, (2,)) + + q = np.array((0.25, 0.50, 0.75)) * q_upper_bound + np.testing.assert_equal(np.percentile(x, q).shape, (3,)) + + x = np.arange(12).reshape(3, 4) + np.testing.assert_equal(np.percentile(x, q_upper_bound / 2), 5.5) + self.assertTrue(np.isscalar(np.percentile(x, q_upper_bound / 2))) + + np.testing.assert_equal(np.percentile([1, 2, 3], 0), 1) + + a = np.array([2, 3, 4, 1]) + np.percentile(a, [q_upper_bound / 2]) + np.testing.assert_equal(a, np.array([2, 3, 4, 1])) + + def test_percentile_basic(self): + pyfunc = np.percentile + self.check_percentile(pyfunc, q_upper_bound=100) + # 
self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) + # self.check_percentile_exceptions(pyfunc) + + def check_percentile_edge_cases(self, pyfunc, q_upper_bound=100): + def check(a, q, abs_tol=1e-14): + @cuda.jit + def kernel(out): + result = np.percentile(a, q) + for i in range(len(out)): + out[i] = result[i] + + out = cuda.to_device(np.zeros(len(q), dtype=np.float64)) + kernel[1, 1](out) + expected = np.percentile(a, q) + + got = out.copy_to_host() + finite = np.isfinite(expected) + + if np.all(finite): + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + else: + self.assertPreciseEqual( + got[finite], expected[finite], abs_tol=abs_tol + ) + + def convert_to_float_and_check(a, q, abs_tol=1e-14): + expected = pyfunc(a, q).astype(np.float64) + got = np.percentile(a, q) + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + + def _array_combinations(elements): + for i in range(1, 10): + for comb in combinations_with_replacement(elements, i): + yield np.array(comb) + + # high number of combinations, many including non-finite values + q = (0, 0.1 * q_upper_bound, 0.2 * q_upper_bound, q_upper_bound) + element_pool = (1, -1, np.nan, np.inf, -np.inf) + for a in _array_combinations(element_pool): + check(a, q) + + # edge cases - numpy exhibits behavioural differences across + # platforms, see: https://github.com/numpy/numpy/issues/13272 + if q_upper_bound == 1: + _check = convert_to_float_and_check + else: + _check = check + + a = np.array(5) + q = np.array(1) + _check(a, q) + + a = 5 + q = q_upper_bound / 2 + _check(a, q) + + def check_percentile_exceptions(self, pyfunc): + # TODO + pass + + def check_quantile_exceptions(self, pyfunc): + # TODO + pass diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index dc86a455b..9c1728225 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -743,4 +743,49 @@ def np_real_atanh_impl(context, builder, sig, args): "DD->D": npyfuncs.np_complex_div_impl, } + 
db[np.isfinite] = { + "f->?": npyfuncs.np_real_isfinite_impl, + "d->?": npyfuncs.np_real_isfinite_impl, + "F->?": npyfuncs.np_complex_isfinite_impl, + "D->?": npyfuncs.np_complex_isfinite_impl, + # int8 + "b->?": npyfuncs.np_int_isfinite_impl, + "B->?": npyfuncs.np_int_isfinite_impl, + # int16 + "h->?": npyfuncs.np_int_isfinite_impl, + "H->?": npyfuncs.np_int_isfinite_impl, + # int32 + "i->?": npyfuncs.np_int_isfinite_impl, + "I->?": npyfuncs.np_int_isfinite_impl, + # int64 + "l->?": npyfuncs.np_int_isfinite_impl, + "L->?": npyfuncs.np_int_isfinite_impl, + # intp + "q->?": npyfuncs.np_int_isfinite_impl, + "Q->?": npyfuncs.np_int_isfinite_impl, + # boolean + "?->?": npyfuncs.np_int_isfinite_impl, + # datetime & timedelta + "M->?": npyfuncs.np_datetime_isfinite_impl, + "m->?": npyfuncs.np_datetime_isfinite_impl, + } + + db[np.multiply] = { + "??->?": numbers.int_and_impl, + "bb->b": numbers.int_mul_impl, + "BB->B": numbers.int_mul_impl, + "hh->h": numbers.int_mul_impl, + "HH->H": numbers.int_mul_impl, + "ii->i": numbers.int_mul_impl, + "II->I": numbers.int_mul_impl, + "ll->l": numbers.int_mul_impl, + "LL->L": numbers.int_mul_impl, + "qq->q": numbers.int_mul_impl, + "QQ->Q": numbers.int_mul_impl, + "ff->f": numbers.real_mul_impl, + "dd->d": numbers.real_mul_impl, + "FF->F": numbers.complex_mul_impl, + "DD->D": numbers.complex_mul_impl, + } + return db From 5aa441ac63f015fc92a54b29ee3251c3eba108e3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 4 Nov 2025 04:49:20 -0800 Subject: [PATCH 19/21] partial list impl --- numba_cuda/numba/cuda/np/arrayobj.py | 6 ++++ numba_cuda/numba/cuda/target.py | 28 ++++++++++++++- .../tests/cudapy/test_array_reductions.py | 34 +++++++++++++------ 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/numba_cuda/numba/cuda/np/arrayobj.py b/numba_cuda/numba/cuda/np/arrayobj.py index bd059250b..19901fcce 100644 --- a/numba_cuda/numba/cuda/np/arrayobj.py +++ b/numba_cuda/numba/cuda/np/arrayobj.py @@ -3646,6 +3646,12 @@ def 
constant_array(context, builder, ty, pyval): """ return context.make_constant_array(builder, ty, pyval) +@lower_constant(types.List) +def constant_list(context, builder, ty, pyval): + """ + Create a constant list (mechanism is target-dependent). + """ + return context.make_constant_list(builder, ty, pyval) @lower_constant(types.Record) def constant_record(context, builder, ty, pyval): diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 6faf15571..1c0ad582a 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -278,12 +278,38 @@ def mangler(self, name, argtypes, *, abi_tags=(), uid=None): name, argtypes, abi_tags=abi_tags, uid=uid ) + def make_constant_list(self, builder, listty, lst): + import numpy as np + constvals = [ + self.get_constant(listty.dtype, i) + for i in iter(np.array(lst)) + ] + instance = self.build_list(builder, listty, constvals) + # create constant address space version of the list + lmod = builder.module + + constlistty = instance.type + constlist = ir.Constant(constlistty, instance) + addrspace = nvvm.ADDRSPACE_CONSTANT + gv = cgutils.add_global_variable( + lmod, constlist.type, "_cudapy_clist", addrspace=addrspace + ) + gv.linkage = "internal" + gv.global_constant = True + gv.initializer = constlist + + # Convert to generic address-space + ptrty = ir.PointerType(constlistty) + genptr = builder.addrspacecast(gv, ptrty, "generic") + lst = cgutils.create_struct_proxy(listty)(self, builder, value=builder.load(genptr)) + return lst._getvalue() + + def make_constant_array(self, builder, aryty, arr): """ Unlike the parent version. This returns a a pointer in the constant addrspace. 
""" - lmod = builder.module constvals = [ diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 3698ff002..3be3f494a 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -414,7 +414,7 @@ def check_even(a): check_even(a) def check_percentile(self, pyfunc, q_upper_bound): - def check(a, q, abs_tol=1e-12): + def check_array_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): result = np.percentile(a, q) @@ -435,28 +435,42 @@ def kernel(out): got[finite], expected[finite], abs_tol=abs_tol ) + def check_scalar_q(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + out[0] = np.percentile(a, q) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + kernel[1, 1](out) + + expected = np.percentile(a, q) + got = out.copy_to_host()[0] + + if np.isfinite(expected): + self.assertPreciseEqual(got, expected, abs_tol=abs_tol) + a = self.random.randn(27).reshape(3, 3, 3) - q = np.linspace(0, q_upper_bound, 14)[::-1] + q = np.linspace(0, q_upper_bound, 14)[::-1].copy() - check(a, q) - check(a, 0) - check(a, q_upper_bound / 2) - check(a, q_upper_bound) + check_array_q(a, q) + check_scalar_q(a, 0) + check_scalar_q(a, q_upper_bound / 2) + check_scalar_q(a, q_upper_bound) not_finite = [np.nan, -np.inf, np.inf] a.flat[:10] = self.random.choice(not_finite, 10) self.random.shuffle(a) self.random.shuffle(q) - check(a, q) + check_array_q(a, q) a = a.flatten().tolist() q = q.flatten().tolist() - check(a, q) - check(tuple(a), tuple(q)) + check_array_q(a, q) + #check(tuple(a), tuple(q)) a = self.random.choice([1, 2, 3, 4], 10) q = np.linspace(0, q_upper_bound, 5) - check(a, q) + check_array_q(a, q) # tests inspired by # https://github.com/numpy/numpy/blob/345b2f6e/numpy/lib/tests/test_function_base.py From 5ab95bc6c4689d1cdfd038c2270610f87a08bef8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 4 Nov 2025 
18:45:40 -0800 Subject: [PATCH 20/21] more tests --- .../tests/cudapy/test_array_reductions.py | 101 ++++++++++++++++-- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 3cfe0c5a4..23d4e006f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -10,6 +10,7 @@ from numba.cuda.testing import skip_on_cudasim from numba.cuda.misc.special import literal_unroll from numba.cuda import config +import unittest def array_median_global(arr): @@ -422,14 +423,14 @@ def check_percentile(self, pyfunc, q_upper_bound): def check_array_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): - result = np.percentile(a, q) + result = pyfunc(a, q) for i in range(len(out)): out[i] = result[i] out = cuda.to_device(np.zeros(len(q), dtype=np.float64)) kernel[1, 1](out) - expected = np.percentile(a, q) + expected = pyfunc(a, q) got = out.copy_to_host() finite = np.isfinite(expected) @@ -443,12 +444,12 @@ def kernel(out): def check_scalar_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): - out[0] = np.percentile(a, q) + out[0] = pyfunc(a, q) out = cuda.to_device(np.zeros(1, dtype=np.float64)) kernel[1, 1](out) - expected = np.percentile(a, q) + expected = pyfunc(a, q) got = out.copy_to_host()[0] if np.isfinite(expected): @@ -481,9 +482,13 @@ def kernel(out): def test_percentile_basic(self): pyfunc = np.percentile - # self.check_percentile(pyfunc, q_upper_bound=100) + self.check_percentile(pyfunc, q_upper_bound=100) self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) - # self.check_percentile_exceptions(pyfunc) + + @unittest.expectedFailure + def test_percentile_exceptions(self): + pyfunc = np.percentile + self.check_percentile_exceptions(pyfunc) def check_percentile_edge_cases(self, pyfunc, q_upper_bound=100): # intended to be a faitful reproduction of the 
upstream numba test @@ -541,9 +546,85 @@ def kernel(a_batch, lengths, q_arr, out): ) def check_percentile_exceptions(self, pyfunc): - # TODO - pass + def check_scalar_q_err(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + out[0] = np.percentile(a, q) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertRaises(ValueError) as raises: + kernel[1, 1](out) + self.assertEqual( + "Percentiles must be in the range [0, 100]", + str(raises.exception), + ) + + # Exceptions leak references + self.disable_leak_check() + a = np.arange(5) + check_scalar_q_err(a, -5) # q less than 0 + check_scalar_q_err(a, 105) + check_scalar_q_err(a, np.nan) + + # complex typing failure + @cuda.jit + def kernel(out): + np.percentile(a, q) + + a = np.arange(5) * 1j + q = 0.1 + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertTypingError(): + kernel[1, 1](out) + @unittest.expectedFailure def check_quantile_exceptions(self, pyfunc): - # TODO - pass + def check_scalar_q_err(a, q, abs_tol=1e-12): + @cuda.jit + def kernel(out): + out[0] = np.quantile(a, q) + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertRaises(ValueError) as raises: + kernel[1, 1](out) + self.assertEqual( + "Quantiles must be in the range [0, 1]", + str(raises.exception), + ) + + # Exceptions leak references + self.disable_leak_check() + a = np.arange(5) + check_scalar_q_err(a, -0.5) # q less than 0 + check_scalar_q_err(a, 1.05) + check_scalar_q_err(a, np.nan) + + # complex typing failure + @cuda.jit + def kernel(out): + np.quantile(a, q) + + a = np.arange(5) * 1j + q = 0.1 + + out = cuda.to_device(np.zeros(1, dtype=np.float64)) + with self.assertTypingError(): + kernel[1, 1](out) + + def test_quantile_basic(self): + pyfunc = np.quantile + self.check_percentile(pyfunc, q_upper_bound=1) + self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + + def test_nanpercentile_basic(self): + pyfunc = np.nanpercentile + self.check_percentile(pyfunc, 
q_upper_bound=100) + self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) + self.check_percentile_exceptions(pyfunc) + + def test_nanquantile_basic(self): + pyfunc = np.nanquantile + self.check_percentile(pyfunc, q_upper_bound=1) + self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + self.check_quantile_exceptions(pyfunc) From 12ef05804ce7a640ec41f16fef14e3c88897a400 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 12 Nov 2025 12:28:05 -0800 Subject: [PATCH 21/21] renaming --- .../tests/cudapy/test_array_reductions.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py index 23d4e006f..60bd07ddc 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py @@ -419,7 +419,7 @@ def check_even(a): for a in array_variations(np.arange(64) + 10.5): check_even(a) - def check_percentile(self, pyfunc, q_upper_bound): + def check_percentile_and_quantile(self, pyfunc, q_upper_bound): def check_array_q(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): @@ -482,15 +482,17 @@ def kernel(out): def test_percentile_basic(self): pyfunc = np.percentile - self.check_percentile(pyfunc, q_upper_bound=100) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=100) @unittest.expectedFailure def test_percentile_exceptions(self): pyfunc = np.percentile - self.check_percentile_exceptions(pyfunc) + self.check_percentile_and_quantile_exceptions(pyfunc) - def check_percentile_edge_cases(self, pyfunc, q_upper_bound=100): + def check_percentile_and_quantile_edge_cases( + self, pyfunc, q_upper_bound=100 + ): # intended to be a faitful reproduction of the upstream numba test # packing all the test cases 
into a single kernel for perf def _array_combinations(elements): @@ -545,7 +547,7 @@ def kernel(a_batch, lengths, q_arr, out): got[i][finite], expected[finite], abs_tol=1e-14 ) - def check_percentile_exceptions(self, pyfunc): + def check_percentile_and_quantile_exceptions(self, pyfunc): def check_scalar_q_err(a, q, abs_tol=1e-12): @cuda.jit def kernel(out): @@ -614,17 +616,17 @@ def kernel(out): def test_quantile_basic(self): pyfunc = np.quantile - self.check_percentile(pyfunc, q_upper_bound=1) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=1) def test_nanpercentile_basic(self): pyfunc = np.nanpercentile - self.check_percentile(pyfunc, q_upper_bound=100) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=100) - self.check_percentile_exceptions(pyfunc) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=100) + self.check_percentile_and_quantile_exceptions(pyfunc) def test_nanquantile_basic(self): pyfunc = np.nanquantile - self.check_percentile(pyfunc, q_upper_bound=1) - self.check_percentile_edge_cases(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile(pyfunc, q_upper_bound=1) + self.check_percentile_and_quantile_edge_cases(pyfunc, q_upper_bound=1) self.check_quantile_exceptions(pyfunc)