From 13395d4af1173c872288792fe4579676c9f8aa1f Mon Sep 17 00:00:00 2001
From: Atmn Patel <atmnp@nvidia.com>
Date: Mon, 29 Sep 2025 09:21:08 -0700
Subject: [PATCH 1/3] [Testing] Add some numpy array testing for the CUDA
 target

---
 numba_cuda/numba/cuda/__init__.py             |   6 +
 numba_cuda/numba/cuda/np/arrayobj.py          |   6 +-
 numba_cuda/numba/cuda/simulator/api.py        |   4 +
 numba_cuda/numba/cuda/target.py               |   9 +-
 .../numba/cuda/tests/test_array_attr.py       | 424 ++++++++++++
 .../numba/cuda/tests/test_array_constants.py  | 275 ++++++++
 .../numba/cuda/tests/test_array_iterators.py  | 638 ++++++++++++++++++
 numba_cuda/numba/cuda/typing/context.py       |   8 +-
 numba_cuda/numba/cuda/ufuncs.py               |   8 +-
 9 files changed, 1368 insertions(+), 10 deletions(-)
 create mode 100644 numba_cuda/numba/cuda/tests/test_array_attr.py
 create mode 100644 numba_cuda/numba/cuda/tests/test_array_constants.py
 create mode 100644 numba_cuda/numba/cuda/tests/test_array_iterators.py

diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py
index 64dd06e65..8ff7391a8 100644
--- a/numba_cuda/numba/cuda/__init__.py
+++ b/numba_cuda/numba/cuda/__init__.py
@@ -60,3 +60,9 @@
     )
 
 from numba.cuda.np.ufunc import vectorize, guvectorize
+
+# Re-export literally and literal_unroll
+from numba.cuda.misc.special import (
+    literally,
+    literal_unroll,
+)
diff --git a/numba_cuda/numba/cuda/np/arrayobj.py b/numba_cuda/numba/cuda/np/arrayobj.py
index 3e06d84cd..303376f72 100644
--- a/numba_cuda/numba/cuda/np/arrayobj.py
+++ b/numba_cuda/numba/cuda/np/arrayobj.py
@@ -16,9 +16,9 @@
 
 import numpy as np
 
-from numba import pndindex, literal_unroll
-from numba.core import types, errors
-from numba.cuda import typing
+from numba import pndindex
+from numba.cuda import literal_unroll
+from numba.core import types, typing, errors
 from numba.cuda import cgutils, extending
 from numba.cuda.np.numpy_support import (
     as_dtype,
diff --git a/numba_cuda/numba/cuda/simulator/api.py b/numba_cuda/numba/cuda/simulator/api.py
index 9d22d2845..c54aadef7 100644
--- a/numba_cuda/numba/cuda/simulator/api.py
+++ b/numba_cuda/numba/cuda/simulator/api.py
@@ -161,3 +161,7 @@ def jitwrapper(fn):
 def defer_cleanup():
     # No effect for simulator
     yield
+
+
+class grid:
+    """Empty stand-in for ``grid`` in the simulator API (no behaviour)."""
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index 31e5f0f18..5caa07595 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -36,7 +36,7 @@
 # Typing
 
 
-class CUDATypingContext(typing.BaseContext):
+class CUDATypingContext(typing.Context):
     def load_additional_registries(self):
         from . import (
             cudadecl,
@@ -46,7 +46,7 @@ def load_additional_registries(self):
             libdevicedecl,
             vector_types,
         )
-        from numba.cuda.typing import enumdecl, cffi_utils
+        from numba.cuda.typing import enumdecl, cffi_utils, npydecl
 
         self.install_registry(cudadecl.registry)
         self.install_registry(cffi_utils.registry)
@@ -57,6 +57,7 @@ def load_additional_registries(self):
         self.install_registry(vector_types.typing_registry)
         self.install_registry(fp16.typing_registry)
         self.install_registry(bf16.typing_registry)
+        self.install_registry(npydecl.registry)
 
     def resolve_value_type(self, val):
         # treat other dispatcher object as another device function
@@ -182,6 +183,8 @@ def load_additional_registries(self):
             arrayobj,
             npdatetime,
             polynomial,
+            arraymath,
+            npyimpl,
         )
         from . import (
             cudaimpl,
@@ -222,6 +225,8 @@ def load_additional_registries(self):
         self.install_registry(polynomial.registry)
         self.install_registry(npdatetime.registry)
         self.install_registry(arrayobj.registry)
+        self.install_registry(arraymath.registry)
+        self.install_registry(npyimpl.registry)
 
         # Install only implementations that are defined outside of numba (i.e.,
         # in third-party extensions) from Numba's builtin_registry.
diff --git a/numba_cuda/numba/cuda/tests/test_array_attr.py b/numba_cuda/numba/cuda/tests/test_array_attr.py
new file mode 100644
index 000000000..8f52a7c4e
--- /dev/null
+++ b/numba_cuda/numba/cuda/tests/test_array_attr.py
@@ -0,0 +1,424 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import numpy as np
+
+import unittest
+from numba.cuda.np.numpy_support import from_dtype
+from numba import typeof
+from numba.cuda import jit
+from numba.core import types
+from numba.cuda.tests.support import MemoryLeakMixin
+from numba.cuda.testing import CUDATestCase
+from numba.core.errors import TypingError
+from numba.cuda.tests.support import override_config
+
+
+def array_itemsize(a, res):
+    res[0] = a.itemsize
+
+
+def array_nbytes(a, res):
+    res[0] = a.nbytes
+
+
+def array_shape(a, i, res):
+    res[0] = a.shape[i]
+
+
+def array_strides(a, i, res):
+    res[0] = a.strides[i]
+
+
+def array_ndim(a, res):
+    res[0] = a.ndim
+
+
+def array_size(a, res):
+    res[0] = a.size
+
+
+def array_flags_contiguous(a, res):
+    res[0] = a.flags.contiguous
+
+
+def array_flags_c_contiguous(a, res):
+    res[0] = a.flags.c_contiguous
+
+
+def array_flags_f_contiguous(a, res):
+    res[0] = a.flags.f_contiguous
+
+
+def nested_array_itemsize(a, res):
+    res[0] = a.f.itemsize
+
+
+def nested_array_nbytes(a, res):
+    res[0] = a.f.nbytes
+
+
+def nested_array_shape(a, res):
+    res[0] = a.f.shape[0]
+    res[1] = a.f.shape[1]
+
+
+def nested_array_strides(a, res):
+    res[0] = a.f.strides[0]
+    res[1] = a.f.strides[1]
+
+
+def nested_array_ndim(a, res):
+    res[0] = a.f.ndim
+
+
+def nested_array_size(a, res):
+    res[0] = a.f.size
+
+
+def size_after_slicing_usecase(buf, i, res):
+    sliced = buf[i]
+    # Make sure size attribute is not lost
+    res[0] = sliced.size
+
+
+def array_real(arr, res):
+    if arr.ndim == 1:
+        for i in range(arr.shape[0]):
+            res[i] = arr.real[i]
+    else:
+        for i in range(arr.shape[0]):
+            for j in range(arr.shape[1]):
+                res[i, j] = arr.real[i, j]
+
+
+def array_imag(arr, res):
+    if arr.ndim == 1:
+        for i in range(arr.shape[0]):
+            res[i] = arr.imag[i]
+    else:
+        for i in range(arr.shape[0]):
+            for j in range(arr.shape[1]):
+                res[i, j] = arr.imag[i, j]
+
+
+class TestArrayAttr(MemoryLeakMixin, CUDATestCase):
+    def setUp(self):
+        super(TestArrayAttr, self).setUp()
+        self.a = np.arange(20, dtype=np.int32).reshape(4, 5)
+
+    def check_unary(self, pyfunc, arr):
+        out = np.zeros(1)
+        aryty = typeof(arr)
+        cfunc = self.get_cfunc(pyfunc, (aryty, typeof(out)))
+        cout = np.zeros(1)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out[0], cout[0])
+        # Retry with forced any layout
+        cfunc = self.get_cfunc(pyfunc, (aryty.copy(layout="A"), typeof(out)))
+        cout = np.zeros(1)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(cout[0], out[0])
+
+    def check_unary_with_arrays(self, pyfunc):
+        """
+        Run ``check_unary`` for ``pyfunc`` on 2-d, transposed, 0-d and
+        empty input arrays.
+        """
+        empty = np.zeros(0)
+        inputs = (
+            self.a,
+            self.a.T,
+            np.array([42]).reshape(()),  # 0-d array
+            empty,  # array with an empty dimension
+            empty.reshape((1, 0, 2)),  # empty array reshaped to 3-d
+        )
+        for arr in inputs:
+            self.check_unary(pyfunc, arr)
+
+    def get_cfunc(self, pyfunc, argspec):
+        return jit(argspec)(pyfunc)
+
+    def test_shape(self):
+        pyfunc = array_shape
+        cfunc = self.get_cfunc(
+            pyfunc, (types.int32[:, :], types.int32, types.float64[:])
+        )
+
+        for i in range(self.a.ndim):
+            out = np.zeros(1)
+            cout = np.zeros(1)
+            pyfunc(self.a, i, out)
+            cfunc[1, 1](self.a, i, cout)
+            self.assertEqual(out[0], cout[0])
+
+    def test_strides(self):
+        pyfunc = array_strides
+        cfunc = self.get_cfunc(
+            pyfunc, (types.int32[:, :], types.int32, types.float64[:])
+        )
+
+        for i in range(self.a.ndim):
+            out = np.zeros(1)
+            cout = np.zeros(1)
+            pyfunc(self.a, i, out)
+            cfunc[1, 1](self.a, i, cout)
+            self.assertEqual(out[0], cout[0])
+
+    def test_ndim(self):
+        self.check_unary_with_arrays(array_ndim)
+
+    def test_size(self):
+        self.check_unary_with_arrays(array_size)
+
+    def test_itemsize(self):
+        self.check_unary_with_arrays(array_itemsize)
+
+    def test_nbytes(self):
+        self.check_unary_with_arrays(array_nbytes)
+
+    def test_flags_contiguous(self):
+        with override_config("CUDA_ENABLE_NRT", True):
+            self.check_unary_with_arrays(array_flags_contiguous)
+
+    def test_flags_c_contiguous(self):
+        with override_config("CUDA_ENABLE_NRT", True):
+            self.check_unary_with_arrays(array_flags_c_contiguous)
+
+    def test_flags_f_contiguous(self):
+        with override_config("CUDA_ENABLE_NRT", True):
+            self.check_unary_with_arrays(array_flags_f_contiguous)
+
+
+class TestNestedArrayAttr(MemoryLeakMixin, CUDATestCase):
+    def setUp(self):
+        super(TestNestedArrayAttr, self).setUp()
+        dtype = np.dtype([("a", np.int32), ("f", np.int32, (2, 5))])
+        self.a = np.recarray(1, dtype)[0]
+        self.nbrecord = from_dtype(self.a.dtype)
+
+    def get_cfunc(self, pyfunc):
+        return jit((self.nbrecord, types.float64[:]))(pyfunc)
+
+    def test_shape(self):
+        pyfunc = nested_array_shape
+        cfunc = self.get_cfunc(pyfunc)
+
+        out = np.zeros(2)
+        cout = np.zeros(2)
+        pyfunc(self.a, out)
+        cfunc[1, 1](self.a, cout)
+        self.assertEqual(out[0], cout[0])
+        self.assertEqual(out[1], cout[1])
+
+    def test_strides(self):
+        pyfunc = nested_array_strides
+        cfunc = self.get_cfunc(pyfunc)
+
+        out = np.zeros(2)
+        cout = np.zeros(2)
+        pyfunc(self.a, out)
+        cfunc[1, 1](self.a, cout)
+        self.assertEqual(out[0], cout[0])
+        self.assertEqual(out[1], cout[1])
+
+    def test_ndim(self):
+        pyfunc = nested_array_ndim
+        cfunc = self.get_cfunc(pyfunc)
+
+        out = np.zeros(1)
+        cout = np.zeros(1)
+        pyfunc(self.a, out)
+        cfunc[1, 1](self.a, cout)
+        self.assertEqual(out[0], cout[0])
+
+    def test_nbytes(self):
+        pyfunc = nested_array_nbytes
+        cfunc = self.get_cfunc(pyfunc)
+
+        out = np.zeros(1)
+        cout = np.zeros(1)
+        pyfunc(self.a, out)
+        cfunc[1, 1](self.a, cout)
+        self.assertEqual(out[0], cout[0])
+
+    def test_size(self):
+        pyfunc = nested_array_size
+        cfunc = self.get_cfunc(pyfunc)
+
+        out = np.zeros(1)
+        cout = np.zeros(1)
+        pyfunc(self.a, out)
+        cfunc[1, 1](self.a, cout)
+        self.assertEqual(out[0], cout[0])
+
+    def test_itemsize(self):
+        pyfunc = nested_array_itemsize
+        cfunc = self.get_cfunc(pyfunc)
+
+        out = np.zeros(1)
+        cout = np.zeros(1)
+        pyfunc(self.a, out)
+        cfunc[1, 1](self.a, cout)
+        self.assertEqual(out[0], cout[0])
+
+
+class TestSlicedArrayAttr(MemoryLeakMixin, CUDATestCase):
+    def test_size_after_slicing(self):
+        pyfunc = size_after_slicing_usecase
+        cfunc = jit(pyfunc)
+        arr = np.arange(2 * 5).reshape(2, 5)
+        for i in range(arr.shape[0]):
+            out = np.zeros(1)
+            cout = np.zeros(1)
+            pyfunc(arr, i, out)
+            cfunc[1, 1](arr, i, cout)
+            self.assertEqual(out[0], cout[0])
+        arr = np.arange(2 * 5 * 3).reshape(2, 5, 3)
+        for i in range(arr.shape[0]):
+            out = np.zeros(1)
+            cout = np.zeros(1)
+            pyfunc(arr, i, out)
+            cfunc[1, 1](arr, i, cout)
+            self.assertEqual(out[0], cout[0])
+
+
+class TestRealImagAttr(MemoryLeakMixin, CUDATestCase):
+    def setUp(self):
+        # override_config is a context manager: it must be entered (not just
+        # called) to apply, and the registered cleanup exits it afterwards.
+        nrt_enabled = override_config("CUDA_ENABLE_NRT", True)
+        nrt_enabled.__enter__()
+        self.addCleanup(nrt_enabled.__exit__, None, None, None)
+        super(TestRealImagAttr, self).setUp()
+
+    def check_complex(self, pyfunc):
+        cfunc = jit(pyfunc)
+        # test 1D
+        size = 10
+        arr = np.arange(size) + np.arange(size) * 10j
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out, cout)
+        # test 2D
+        arr = arr.reshape(2, 5)
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out, cout)
+
+    def test_complex_real(self):
+        self.check_complex(array_real)
+
+    def test_complex_imag(self):
+        self.check_complex(array_imag)
+
+    def check_number_real(self, dtype):
+        pyfunc = array_real
+        cfunc = jit(pyfunc)
+        # test 1D
+        size = 10
+        arr = np.arange(size, dtype=dtype)
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out, cout)
+        # test 2D
+        arr = arr.reshape(2, 5)
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out, cout)
+        # test identity
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertEqual(arr.data, out.data)
+        self.assertEqual(arr.data, cout.data)
+        # test writable
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        cfunc[1, 1](arr, cout)
+        self.assertNotEqual(cout[0, 0], 5)
+        cout[0, 0] = 5
+        self.assertEqual(cout[0, 0], 5)
+
+    def test_number_real(self):
+        """Testing .real of non-complex dtypes"""
+        for dtype in [np.uint8, np.int32, np.float32, np.float64]:
+            self.check_number_real(dtype)
+
+    def check_number_imag(self, dtype):
+        pyfunc = array_imag
+        cfunc = jit(pyfunc)
+        # test 1D
+        size = 10
+        arr = np.arange(size, dtype=dtype)
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out, cout)
+        # test 2D
+        arr = arr.reshape(2, 5)
+        out = np.zeros(arr.shape)
+        cout = np.zeros(arr.shape)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out, cout)
+        # test are zeros
+        cout = np.zeros(arr.shape)
+        cfunc[1, 1](arr, cout)
+        self.assertEqual(cout.tolist(), np.zeros_like(arr).tolist())
+
+    def test_number_imag(self):
+        """Testing .imag of non-complex dtypes"""
+        with override_config("CUDA_ENABLE_NRT", True):
+            for dtype in [np.uint8, np.int32, np.float32, np.float64]:
+                self.check_number_imag(dtype)
+
+    def test_record_real(self):
+        rectyp = np.dtype([("real", np.float32), ("imag", np.complex64)])
+        arr = np.zeros(3, dtype=rectyp)
+        arr["real"] = np.random.random(arr.size)
+        arr["imag"] = np.random.random(arr.size) * 1.3j
+
+        # check numpy behavior
+        # .real is identity
+        out = np.zeros(arr.shape, dtype=arr.dtype)
+        array_real(arr, out)
+        self.assertPreciseEqual(out, arr)
+        # .imag is zero_like
+        out = np.zeros(arr.shape, dtype=arr.dtype)
+        array_imag(arr, out)
+        self.assertEqual(out.tolist(), np.zeros_like(arr).tolist())
+
+        # check numba behavior
+        # it's most likely a user error, anyway
+        jit_array_real = jit(array_real)
+        jit_array_imag = jit(array_imag)
+
+        cout = np.zeros(arr.shape, dtype=arr.dtype)
+        with self.assertRaises(TypingError) as raises:
+            jit_array_real[1, 1](arr, cout)
+        self.assertIn(
+            "cannot access .real of array of Record", str(raises.exception)
+        )
+
+        cout = np.zeros(arr.shape, dtype=arr.dtype)
+        with self.assertRaises(TypingError) as raises:
+            jit_array_imag[1, 1](arr, cout)
+        self.assertIn(
+            "cannot access .imag of array of Record", str(raises.exception)
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/numba_cuda/numba/cuda/tests/test_array_constants.py b/numba_cuda/numba/cuda/tests/test_array_constants.py
new file mode 100644
index 000000000..e5a927e52
--- /dev/null
+++ b/numba_cuda/numba/cuda/tests/test_array_constants.py
@@ -0,0 +1,275 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import numpy as np
+
+import unittest
+from numba.cuda import jit, grid, to_device, device_array
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+
+
+a0 = np.array(42)
+
+s1 = np.int32(64)
+
+a1 = np.arange(12)
+a2 = a1[::2]
+a3 = a1.reshape((3, 4)).T
+
+dt = np.dtype([("x", np.int8), ("y", "S3")])
+
+a4 = np.arange(32, dtype=np.int8).view(dt)
+a5 = a4[::-2]
+
+# A recognizable data string
+a6 = np.frombuffer(b"XXXX_array_contents_XXXX", dtype=np.float32)
+
+
+myarray = np.array(
+    [
+        1,
+    ]
+)
+
+
+@jit
+def getitem0_kernel(input_array, output_array, size):
+    i = grid(1)
+    if i < size:
+        output_array[i] = input_array[()]
+
+
+@jit
+def getitem1_kernel(input_array, output_array, size):
+    i = grid(1)
+    if i < size:
+        output_array[i] = input_array[i]
+
+
+@jit
+def getitem2_kernel(input_array, output_array, size):
+    i = grid(1)
+    if i < size:
+        output_array[i] = input_array[i]
+
+
+@jit
+def getitem3_kernel(input_array, output_array, size):
+    i = grid(1)
+    if i < size:
+        # For 2D arrays, flatten the indexing
+        if input_array.ndim > 1:
+            # Calculate 2D indices from flat index
+            flat_idx = i
+            if flat_idx < input_array.size:
+                row = flat_idx // input_array.shape[1]
+                col = flat_idx % input_array.shape[1]
+                output_array[i] = input_array[row, col]
+        else:
+            output_array[i] = input_array[i]
+
+
+@jit
+def getitem4_kernel(input_array, output_array, size):
+    i = grid(1)
+    if i < size:
+        output_array[i] = input_array[i]
+
+
+@jit
+def getitem5_kernel(input_array, output_array, size):
+    i = grid(1)
+    if i < size:
+        output_array[i] = input_array[i]
+
+
+@jit
+def getitem6_kernel(input_array, output_array, size):
+    i = grid(1)
+    if i < size:
+        output_array[i] = input_array[i]
+
+
+@jit
+def use_arrayscalar_const_kernel(output_array, size):
+    i = grid(1)
+    if i < size:
+        output_array[i] = s1
+
+
+@jit
+def write_to_global_array_kernel(global_array):
+    i = grid(1)
+    if i < 1:
+        global_array[0] = 1
+
+
+@jit
+def bytes_as_const_array_kernel(output_array, size):
+    i = grid(1)
+    if i < size:
+        # Use hardcoded bytes values instead of frombuffer
+        # "foo" as uint8 values: f=102, o=111, o=111
+        if i % 3 == 0:
+            output_array[i] = 102  # 'f'
+        elif i % 3 == 1:
+            output_array[i] = 111  # 'o'
+        else:
+            output_array[i] = 111  # 'o'
+
+
+@skip_on_cudasim
+class TestConstantArray(CUDATestCase):
+    """
+    Test array constants.
+    """
+
+    def check_array_const(self, kernel_func, input_array, expected_size):
+        """
+        Launch ``kernel_func`` as one block of ``expected_size`` threads and
+        compare each output element with ``input_array`` in C order; a 0-d
+        input is compared against its single value at every position.
+        """
+        # Convert input array to device (make contiguous if needed)
+        if (
+            not input_array.flags["C_CONTIGUOUS"]
+            and not input_array.flags["F_CONTIGUOUS"]
+        ):
+            input_array = np.ascontiguousarray(input_array)
+        d_input = to_device(input_array)
+        d_output = device_array(expected_size, dtype=input_array.dtype)
+
+        # Launch kernel
+        kernel_func[1, expected_size](d_input, d_output, expected_size)
+
+        # Get result
+        result = d_output.copy_to_host()
+
+        # Verify result matches expected values
+        for i in range(expected_size):
+            if input_array.ndim == 0:
+                # For 0D arrays, all results should be the same
+                expected_val = input_array[()]
+            elif i < input_array.size:
+                # ``.flat`` indexes in C order whatever the dimensionality
+                expected_val = input_array.flat[i]
+            else:
+                # No input element corresponds to this thread
+                continue
+            np.testing.assert_array_equal(result[i], expected_val)
+
+    def test_array_const_0d(self):
+        self.check_array_const(getitem0_kernel, a0, 3)
+
+    def test_array_const_1d_contig(self):
+        self.check_array_const(getitem1_kernel, a1, 3)
+
+    def test_array_const_1d_noncontig(self):
+        self.check_array_const(getitem2_kernel, a2, 3)
+
+    def test_array_const_2d(self):
+        self.check_array_const(getitem3_kernel, a3, 3)
+
+    def test_record_array_const_contig(self):
+        self.check_array_const(getitem4_kernel, a4, 3)
+
+    def test_record_array_const_noncontig(self):
+        self.check_array_const(getitem5_kernel, a5, 3)
+
+    def test_array_const_alignment(self):
+        """
+        Issue #1933: the array declaration in the LLVM IR must have
+        the right alignment specified.
+        """
+        # Test the kernel with the alignment array
+        self.check_array_const(getitem6_kernel, a6, 3)
+
+    def test_arrayscalar_const(self):
+        # Test arrayscalar constant in CUDA kernel
+        d_output = device_array(1, dtype=np.int32)
+        use_arrayscalar_const_kernel[1, 1](d_output, 1)
+        result = d_output.copy_to_host()
+        self.assertEqual(result[0], s1)
+
+    def test_write_to_global_array(self):
+        # Start from zeros: ``myarray`` already holds 1, so copying it would
+        # make the assertion below pass even if the kernel never wrote.
+        d_myarray = to_device(np.zeros_like(myarray))
+        write_to_global_array_kernel[1, 1](d_myarray)
+        result = d_myarray.copy_to_host()
+        self.assertEqual(result[0], 1)
+
+    def test_issue_1850(self):
+        """
+        This issue is caused by an unresolved bug in numpy since version 1.6.
+        See numpy GH issue #3147.
+        """
+        constarr = np.array([86])
+
+        @jit
+        def issue_1850_kernel(output_array, size):
+            i = grid(1)
+            if i < size:
+                output_array[i] = constarr[0]
+
+        d_output = device_array(1, dtype=np.int32)
+        issue_1850_kernel[1, 1](d_output, 1)
+        out = d_output.copy_to_host()
+        self.assertEqual(out[0], 86)
+
+    def test_too_big_to_freeze(self):
+        """
+        Test issue https://github.com/numba/numba/issues/2188, where freezing
+        a large constant array into the code takes prohibitively long and
+        consumes too much RAM.
+        """
+        nelem = 10**4  # Reduced size for CUDA testing
+
+        @jit
+        def big_array_kernel(input_array, output_array, size):
+            i = grid(1)
+            if i < size:
+                if input_array.ndim == 1:
+                    output_array[i] = input_array[i]
+                else:
+                    # For multi-dimensional arrays, flatten the indexing
+                    flat_idx = i
+                    if flat_idx < input_array.size:
+                        output_array[i] = input_array.flat[flat_idx]
+
+        c_array = np.arange(nelem).reshape(nelem)
+        f_array = np.asfortranarray(np.random.random((2, nelem // 2)))
+
+        # Test C contig
+        d_input = to_device(c_array)
+        d_output = device_array(nelem, dtype=c_array.dtype)
+        # Use proper block size for CUDA (max 1024 threads per block)
+        block_size = min(nelem, 1024)
+        grid_size = (nelem + block_size - 1) // block_size
+        big_array_kernel[grid_size, block_size](d_input, d_output, nelem)
+        result = d_output.copy_to_host()
+        np.testing.assert_array_equal(c_array, result)
+
+        # Test F contig
+        d_input = to_device(f_array)
+        d_output = device_array(f_array.size, dtype=f_array.dtype)
+        block_size = min(f_array.size, 1024)
+        grid_size = (f_array.size + block_size - 1) // block_size
+        big_array_kernel[grid_size, block_size](d_input, d_output, f_array.size)
+        result = d_output.copy_to_host()
+        np.testing.assert_array_equal(f_array.flatten(), result)
+
+
+@skip_on_cudasim
+class TestConstantBytes(CUDATestCase):
+    def test_constant_bytes(self):
+        # Test constant bytes array in CUDA kernel
+        d_output = device_array(3, dtype=np.uint8)
+        bytes_as_const_array_kernel[1, 3](d_output, 3)
+        result = d_output.copy_to_host()
+        expected = np.frombuffer(b"foo", dtype=np.uint8)
+        np.testing.assert_array_equal(result, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/numba_cuda/numba/cuda/tests/test_array_iterators.py b/numba_cuda/numba/cuda/tests/test_array_iterators.py
new file mode 100644
index 000000000..0cd9c8d12
--- /dev/null
+++ b/numba_cuda/numba/cuda/tests/test_array_iterators.py
@@ -0,0 +1,638 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+
+import numpy as np
+
+from numba.cuda import jit
+from numba import typeof
+from numba.core import types
+from numba.cuda.tests.support import MemoryLeakMixin, override_config
+from numba.cuda.testing import CUDATestCase
+import unittest
+import itertools
+
+
+def array_iter(arr, out):
+    total = 0
+    for i, v in enumerate(arr):
+        total += i * v
+    out[0] = total
+
+
+def array_view_iter(arr, idx, out):
+    total = 0
+    for i, v in enumerate(arr[idx]):
+        total += i * v
+    out[0] = total
+
+
+def array_flat(arr, out):
+    for i, v in enumerate(arr.flat):
+        out[i] = v
+
+
+def array_flat_getitem(arr, ind, out):
+    out[0] = arr.flat[ind]
+
+
+def array_flat_setitem(arr, ind, val):
+    arr.flat[ind] = val
+
+
+def array_flat_sum(arr, out):
+    s = 0
+    for i, v in enumerate(arr.flat):
+        s = s + (i + 1) * v
+    out[0] = s
+
+
+def array_flat_len(arr, out):
+    out[0] = len(arr.flat)
+
+
+def array_ndenumerate_sum(arr, out):
+    s = 0
+    for (i, j), v in np.ndenumerate(arr):
+        s = s + (i + 1) * (j + 1) * v
+    out[0] = s
+
+
+def np_ndindex_empty(out):
+    s = 0
+    for ind in np.ndindex(()):
+        s += s + len(ind) + 1
+    out[0] = s
+
+
+def np_ndindex(x, y, out):
+    s = 0
+    n = 0  # noqa: F841
+    for i, j in np.ndindex(x, y):
+        s = s + (i + 1) * (j + 1)
+    out[0] = s
+
+
+def np_ndindex_array(arr, out):
+    s = 0
+    n = 0  # noqa: F841
+    for indices in np.ndindex(arr.shape):
+        for i, j in enumerate(indices):
+            s = s + (i + 1) * (j + 1)
+    out[0] = s
+
+
+def np_nditer1a(a, out):
+    for u in np.nditer(a):
+        out = u.item()  # noqa: F841
+
+
+def np_nditer1b(a, out):
+    i = 0
+    for u in np.nditer(a):
+        out[i] = u.item()
+        i += 1
+
+
+def np_nditer2a(a, b, out):
+    for u, v in np.nditer((a, b)):
+        out[0] = u.item()
+        out[1] = v.item()
+
+
+def np_nditer2b(a, b, out):
+    i = 0
+    for u, v in np.nditer((a, b)):
+        out[i] = u.item()
+        out[i + 1] = v.item()
+        i += 2
+
+
+def np_nditer2b_err(a, b, out):
+    i = 0
+    for u, v in np.nditer((a, b)):
+        out[i] = u.item()
+        out[i + 1] = v.item()
+        i += 2
+
+
+def np_nditer3(a, b, c, out):
+    i = 0
+    for u, v, w in np.nditer((a, b, c)):
+        out[i] = u.item()
+        out[i + 1] = v.item()
+        out[i + 2] = w.item()
+        i += 3
+
+
+def iter_next(arr, out):
+    it = iter(arr)
+    it2 = iter(arr)
+    out[0] = next(it)
+    out[1] = next(it)
+    out[2] = next(it2)
+
+
+#
+# Test premature free (see issue #2112).
+# The following test allocates an array ``x`` inside the body.
+# The compiler will put a ``del x`` right after the last use of ``x``,
+# which is right after the creation of the array iterator and
+# before the loop is entered.  If the iterator does not incref the array,
+# the iterator will be reading garbage data of free'ed memory.
+#
+
+
+def array_flat_premature_free(size, out):
+    x = np.arange(size)
+    res = np.zeros_like(x, dtype=np.intp)
+    for i, v in enumerate(x.flat):
+        res[i] = v
+    for i in range(len(res)):
+        out[i] = res[i]
+
+
+def array_ndenumerate_premature_free(size, out):
+    x = np.arange(size)
+    res = np.zeros_like(x, dtype=np.intp)
+    for i, v in np.ndenumerate(x):
+        res[i] = v
+    for i in range(len(res)):
+        out[i] = res[i]
+
+
+class TestArrayIterators(MemoryLeakMixin, CUDATestCase):
+    """
+    Test array.flat,  etc.
+    """
+
+    def setUp(self):
+        super(TestArrayIterators, self).setUp()
+
+    def check_array_iter_1d(self, arr):
+        out = np.zeros(1, dtype=np.int32)
+        cout = np.zeros(1, dtype=np.int32)
+        pyfunc = array_iter
+        cfunc = jit((typeof(arr), typeof(out)))(pyfunc)
+        pyfunc(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out[0], cout[0])
+
+    def check_array_view_iter(self, arr, index):
+        """
+        Check that ``array_view_iter`` gives the same result when run as
+        plain Python and when launched as a single-thread CUDA kernel.
+        """
+        expected = np.zeros(1)
+        got = np.zeros(1)
+        signature = (typeof(arr), typeof(index), typeof(expected))
+        kernel = jit(signature)(array_view_iter)
+
+        array_view_iter(arr, index, expected)
+        kernel[1, 1](arr, index, got)
+
+        self.assertPreciseEqual(expected[0], got[0])
+
+    def check_array_flat(self, arr, arrty=None):
+        """
+        Check that ``array_flat`` fills its output identically when run as
+        plain Python and as a CUDA kernel.
+
+        ``arrty`` overrides the Numba type used for ``arr`` in the kernel
+        signature; by default it is ``typeof(arr)``.
+        """
+        expected = np.zeros(arr.size, dtype=arr.dtype)
+        got = expected.copy()
+        argty = typeof(arr) if arrty is None else arrty
+        kernel = jit((argty, typeof(expected)))(array_flat)
+
+        array_flat(arr, expected)
+        kernel[1, 1](arr, got)
+
+        self.assertPreciseEqual(expected, got)
+
+    def check_array_unary(self, arr, arrty, func):
+        out = np.zeros(3)
+        cout = np.zeros(3)
+        cfunc = jit((arrty, typeof(out)))(func)
+        func(arr, out)
+        cfunc[1, 1](arr, cout)
+        self.assertPreciseEqual(out, cout)
+
+    def check_array_ndenumerate_sum(self, arr, arrty):
+        self.check_array_unary(arr, arrty, array_ndenumerate_sum)
+
+    def test_array_iter(self):
+        # Test iterating over arrays
+        arr = np.arange(6)
+        self.check_array_iter_1d(arr)
+        arr = arr[::2]
+        self.assertFalse(arr.flags.c_contiguous)
+        self.assertFalse(arr.flags.f_contiguous)
+        self.check_array_iter_1d(np.ascontiguousarray(arr))
+        arr = np.bool_([1, 0, 0, 1])
+        self.check_array_iter_1d(arr)
+
+    def test_array_view_iter(self):
+        # Test iterating over a 1d view over a 2d array
+        arr = np.arange(12).reshape((3, 4))
+        self.check_array_view_iter(arr, 1)
+        self.check_array_view_iter(arr.T, 1)  # transposed (F-ordered) input
+        arr = arr[::2]  # every other row
+        self.check_array_view_iter(np.ascontiguousarray(arr), 1)
+        arr = np.bool_([1, 0, 0, 1]).reshape((2, 2))  # boolean dtype
+        self.check_array_view_iter(arr, 1)
+
+    def test_array_flat_3d(self):
+        arr = np.arange(24).reshape(4, 2, 3)
+
+        arrty = typeof(arr)
+        self.assertEqual(arrty.ndim, 3)
+        self.assertEqual(arrty.layout, "C")
+        self.assertTrue(arr.flags.c_contiguous)
+        # Test with C-contiguous array
+        self.check_array_flat(arr)
+        # Test with Fortran-contiguous array
+        arr = arr.transpose()
+        self.assertFalse(arr.flags.c_contiguous)
+        self.assertTrue(arr.flags.f_contiguous)
+        self.assertEqual(typeof(arr).layout, "F")
+        self.check_array_flat(arr)
+        # Non-contiguous source; a contiguous copy of it is what gets checked
+        arr = arr[::2]
+        self.assertFalse(arr.flags.c_contiguous)
+        self.assertFalse(arr.flags.f_contiguous)
+        self.assertEqual(typeof(arr).layout, "A")
+        self.check_array_flat(np.ascontiguousarray(arr))
+        # Boolean array
+        arr = np.bool_([1, 0, 0, 1] * 2).reshape((2, 2, 2))
+        self.check_array_flat(np.ascontiguousarray(arr))
+
+    def test_array_flat_empty(self):
+        # Test .flat with various shapes of empty arrays, contiguous
+        # and non-contiguous (see issue #846).
+
+        # Define a local checking function. Numba's `typeof` ends up aliasing
+        # C and F ordered empty arrays, so each layout is compiled through an
+        # explicit signature instead of being inferred from the argument.
+        def check(arr, arrty):
+            out = np.zeros(1, dtype=np.int32)
+            cout = np.zeros(1, dtype=np.int32)
+            cfunc = jit((arrty, typeof(out)))(array_flat_sum)  # forced layout
+            array_flat_sum(arr, out)
+            cfunc[1, 1](arr, cout)
+            self.assertPreciseEqual(out[0], cout[0])
+
+        arr = np.zeros(0, dtype=np.int32)
+        arr = arr.reshape(0, 2)  # zero-length first axis
+        arrty = types.Array(types.int32, 2, layout="C")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="F")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="A")
+        check(arr, arrty)
+        arr = arr.reshape(2, 0)  # zero-length last axis
+        arrty = types.Array(types.int32, 2, layout="C")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="F")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="A")
+        check(arr, arrty)
+
+    def test_array_flat_getitem(self):
+        # Test indexing of array.flat object
+        pyfunc = array_flat_getitem
+        cfunc = jit(pyfunc)  # no explicit signature given
+
+        def check(arr, ind):
+            out = np.zeros(1, dtype=np.int32)  # buffer passed to pyfunc
+            cout = np.zeros(1, dtype=np.int32)  # buffer passed to the kernel
+            pyfunc(arr, ind, out)
+            cfunc[1, 1](arr, ind, cout)
+            self.assertEqual(cout[0], out[0])
+
+        arr = np.arange(24).reshape(4, 2, 3)
+        for i in range(arr.size):  # every valid flat index
+            check(arr, i)
+        arr = arr.T  # transposed view
+        for i in range(arr.size):
+            check(arr, i)
+        arr = arr[::2]  # strided slice along the first axis
+        for i in range(arr.size):
+            check(np.ascontiguousarray(arr), i)
+        arr = np.array([42]).reshape(())  # 0-d array
+        for i in range(arr.size):
+            check(arr, i)
+        # Boolean array
+        arr = np.bool_([1, 0, 0, 1])
+        for i in range(arr.size):
+            check(arr, i)
+        arr = arr[::2]  # strided boolean view
+        for i in range(arr.size):
+            check(np.ascontiguousarray(arr), i)
+
+    def test_array_flat_setitem(self):
+        # Test item assignment on array.flat (via array_flat_setitem)
+        pyfunc = array_flat_setitem
+        cfunc = jit(pyfunc)  # no explicit signature given
+
+        def check(arr, ind):
+            # Use np.copy() to keep the layout
+            expected = np.copy(arr)  # copy handed to pyfunc
+            got = np.copy(arr)  # copy handed to the kernel
+            pyfunc(expected, ind, 123)
+            cfunc[1, 1](got, ind, 123)
+            self.assertPreciseEqual(got, expected)
+
+        arr = np.arange(24).reshape(4, 2, 3)
+        for i in range(arr.size):  # every valid flat index
+            check(arr, i)
+        arr = arr.T  # transposed view
+        for i in range(arr.size):
+            check(arr, i)
+        arr = arr[::2]  # strided slice along the first axis
+        for i in range(arr.size):
+            check(np.ascontiguousarray(arr), i)
+        arr = np.array([42]).reshape(())  # 0-d array
+        for i in range(arr.size):
+            check(arr, i)
+        # Boolean array
+        arr = np.bool_([1, 0, 0, 1])
+        for i in range(arr.size):
+            check(arr, i)
+        arr = arr[::2]  # strided view; check() copies it before use
+        for i in range(arr.size):
+            check(arr, i)
+
+    def test_array_flat_len(self):
+        # Test len(array.flat)
+        pyfunc = array_flat_len
+        cfunc = jit(array_flat_len)  # no explicit signature given
+
+        def check(arr):
+            out = np.zeros(1, dtype=np.int32)  # buffer passed to pyfunc
+            cout = np.zeros(1, dtype=np.int32)  # buffer passed to the kernel
+            pyfunc(arr, out)
+            cfunc[1, 1](arr, cout)
+            self.assertEqual(cout[0], out[0])
+
+        arr = np.arange(24).reshape(4, 2, 3)
+        check(arr)
+        arr = arr.T  # transposed view
+        check(arr)
+        arr = np.array([42]).reshape(())  # 0-d array
+        check(arr)
+
+    def test_array_flat_premature_free(self):
+        with override_config("CUDA_ENABLE_NRT", True):  # NRT on for this test
+            out = np.zeros(6)  # pure-Python output
+            cout = np.zeros(6)  # kernel output
+            cfunc = jit((types.intp, typeof(out)))(array_flat_premature_free)
+            array_flat_premature_free(6, out)
+            cfunc[1, 1](6, cout)
+            self.assertTrue(cout.sum())  # sum must be non-zero
+            self.assertPreciseEqual(out, cout)
+
+    def test_array_ndenumerate_2d(self):
+        arr = np.arange(12).reshape(4, 3)
+        arrty = typeof(arr)
+        self.assertEqual(arrty.ndim, 2)
+        self.assertEqual(arrty.layout, "C")
+        self.assertTrue(arr.flags.c_contiguous)
+        # Test with C-contiguous array
+        self.check_array_ndenumerate_sum(arr, arrty)
+        # Test with Fortran-contiguous array
+        arr = arr.transpose()
+        self.assertFalse(arr.flags.c_contiguous)
+        self.assertTrue(arr.flags.f_contiguous)
+        arrty = typeof(arr)
+        self.assertEqual(arrty.layout, "F")
+        self.check_array_ndenumerate_sum(arr, arrty)
+        # Non-contiguous source: typed as layout "A", run on a contiguous copy
+        arr = arr[::2]
+        self.assertFalse(arr.flags.c_contiguous)
+        self.assertFalse(arr.flags.f_contiguous)
+        arrty = typeof(arr)
+        self.assertEqual(arrty.layout, "A")
+        self.check_array_ndenumerate_sum(np.ascontiguousarray(arr), arrty)
+        # Boolean array
+        arr = np.bool_([1, 0, 0, 1]).reshape((2, 2))
+        self.check_array_ndenumerate_sum(np.ascontiguousarray(arr), typeof(arr))
+
+    def test_array_ndenumerate_empty(self):
+        # Define a local checking function. Numba's `typeof` ends up aliasing
+        # C and F ordered empty arrays, so each layout is compiled through an
+        # explicit signature instead of being inferred from the argument.
+        def check(arr, arrty):
+            out = np.zeros(1, dtype=np.int32)
+            cout = np.zeros(1, dtype=np.int32)
+            cfunc = jit((arrty, typeof(out)))(array_ndenumerate_sum)
+            array_ndenumerate_sum(arr, out)
+            cfunc[1, 1](arr, cout)
+            np.testing.assert_allclose(out[0], cout[0])
+
+        arr = np.zeros(0, dtype=np.int32)
+        arr = arr.reshape(0, 2)  # zero-length first axis
+        arrty = types.Array(types.int32, 2, layout="C")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="F")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="A")
+        check(arr, arrty)
+        arr = arr.reshape(2, 0)  # zero-length last axis
+        arrty = types.Array(types.int32, 2, layout="C")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="F")
+        check(arr, arrty)
+        arrty = types.Array(types.int32, 2, layout="A")
+        check(arr, arrty)
+
+    def test_array_ndenumerate_premature_free(self):
+        with override_config("CUDA_ENABLE_NRT", True):  # NRT on for this test
+            out = np.zeros(6)  # pure-Python output
+            cout = np.zeros(6)  # kernel output
+            cfunc = jit((types.intp, typeof(out)))(
+                array_ndenumerate_premature_free
+            )
+            array_ndenumerate_premature_free(6, out)
+            cfunc[1, 1](6, cout)
+            self.assertTrue(cout.sum())  # sum must be non-zero
+            self.assertPreciseEqual(out, cout)
+
+    def test_np_ndindex(self):
+        func = np_ndindex
+        out = np.zeros(1)  # reused (not reset) across the calls below
+        cout = np.zeros(1)  # reused (not reset) across the calls below
+        cfunc = jit(
+            (
+                types.int32,
+                types.int32,
+                typeof(out),
+            )
+        )(func)
+        func(3, 4, out)
+        cfunc[1, 1](3, 4, cout)
+        self.assertPreciseEqual(out, cout)
+        func(3, 0, out)  # second argument zero
+        cfunc[1, 1](3, 0, cout)
+        self.assertPreciseEqual(out, cout)
+        func(0, 3, out)  # first argument zero
+        cfunc[1, 1](0, 3, cout)
+        self.assertPreciseEqual(out, cout)
+        func(0, 0, out)  # both arguments zero
+        cfunc[1, 1](0, 0, cout)
+        self.assertPreciseEqual(out, cout)
+
+    def test_np_ndindex_array(self):
+        func = np_ndindex_array
+        arr = np.arange(12, dtype=np.int32) + 10  # 1-d input
+        self.check_array_unary(arr, typeof(arr), func)
+        arr = arr.reshape((4, 3))  # same data, 2-d
+        self.check_array_unary(arr, typeof(arr), func)
+        arr = arr.reshape((2, 2, 3))  # same data, 3-d
+        self.check_array_unary(arr, typeof(arr), func)
+
+    def test_iter_next(self):
+        # This also checks memory management with iter() and next()
+        func = iter_next
+        arr = np.arange(12, dtype=np.int32) + 10  # 1-d int32 input
+        self.check_array_unary(arr, typeof(arr), func)
+
+
+class TestNdIter(MemoryLeakMixin, CUDATestCase):
+    """
+    Test np.nditer() by comparing jitted kernels with pure-Python runs.
+    """
+
+    def inputs_a(self):
+        # scalars
+        yield np.float32(100)
+
+        # 0-d arrays
+        yield np.array(102, dtype=np.int16)
+
+    def inputs_b(self):
+        # All these inputs are compatible with a (3, 4) main shape
+        # 1-d arrays
+        yield np.arange(4).astype(np.complex64)
+        yield np.arange(8)[::2]  # strided, length 4
+
+        # 2-d arrays
+        a = np.arange(12).reshape((3, 4))
+        yield a
+        yield a.copy(order="F")
+        a = np.arange(24).reshape((6, 4))[::2]  # strided rows, shape (3, 4)
+        yield a
+
+    def basic_inputs(self):  # smaller input set, used by test_nditer3
+        yield np.arange(4).astype(np.complex64)
+        yield np.arange(8)[::2]
+        a = np.arange(12).reshape((3, 4))
+        yield a
+        yield a.copy(order="F")
+
+    def check_result(self, got, expected):  # compares as sets
+        self.assertEqual(set(got), set(expected), (got, expected))
+
+    def test_nditer1a(self):
+        pyfunc = np_nditer1a
+        cfunc = jit(pyfunc)  # no explicit signature given
+        for a in self.inputs_a():
+            out = np.zeros(a.size, dtype=a.dtype)  # pure-Python output
+            cout = np.zeros(a.size, dtype=a.dtype)  # kernel output
+            pyfunc(a, out)
+            cfunc[1, 1](a, cout)
+            self.assertPreciseEqual(out, cout)
+
+    def test_nditer1b(self):
+        pyfunc = np_nditer1b
+        cfunc = jit(pyfunc)  # no explicit signature given
+        for a in self.inputs_b():
+            out = np.zeros(a.size, dtype=a.dtype)  # pure-Python output
+            cout = np.zeros(a.size, dtype=a.dtype)  # kernel output
+            pyfunc(np.ascontiguousarray(a), out)  # C-contiguous version of a
+            cfunc[1, 1](np.ascontiguousarray(a), cout)
+            self.assertPreciseEqual(out, cout)
+
+    def test_nditer2a(self):
+        pyfunc = np_nditer2a
+        cfunc = jit(pyfunc)  # no explicit signature given
+        for a, b in itertools.product(self.inputs_a(), self.inputs_a()):
+            out = np.zeros(
+                a.size + b.size, dtype=np.result_type(a.dtype, b.dtype)
+            )
+            cout = np.zeros(
+                a.size + b.size, dtype=np.result_type(a.dtype, b.dtype)
+            )
+            pyfunc(a, b, out)
+            cfunc[1, 1](a, b, cout)
+            self.check_result(out, cout)  # compared as sets
+
+    def test_nditer2b(self):
+        pyfunc = np_nditer2b
+        cfunc = jit(pyfunc)  # no explicit signature given
+        for a, b in itertools.product(self.inputs_b(), self.inputs_b()):
+            out = np.zeros(
+                a.size * b.size, dtype=np.result_type(a.dtype, b.dtype)
+            )
+            cout = np.zeros(
+                a.size * b.size, dtype=np.result_type(a.dtype, b.dtype)
+            )
+            pyfunc(np.ascontiguousarray(a), np.ascontiguousarray(b), out)
+            cfunc[1, 1](np.ascontiguousarray(a), np.ascontiguousarray(b), cout)
+            self.check_result(out, cout)  # compared as sets
+
+    def test_nditer3(self):
+        pyfunc = np_nditer3
+        cfunc = jit(pyfunc)  # no explicit signature given
+        # Use a restricted set of inputs, to shorten test time
+        inputs = self.basic_inputs
+        for a, b, c in itertools.product(inputs(), inputs(), inputs()):
+            out = np.zeros(
+                a.size * b.size * c.size,
+                dtype=np.result_type(a.dtype, b.dtype, c.dtype),
+            )
+            cout = np.zeros(
+                a.size * b.size * c.size,
+                dtype=np.result_type(a.dtype, b.dtype, c.dtype),
+            )
+            pyfunc(
+                np.ascontiguousarray(a),
+                np.ascontiguousarray(b),
+                np.ascontiguousarray(c),
+                out,
+            )
+            cfunc[1, 1](
+                np.ascontiguousarray(a),
+                np.ascontiguousarray(b),
+                np.ascontiguousarray(c),
+                cout,
+            )
+            self.check_result(out, cout)  # compared as sets
+
+    def test_errors(self):
+        # Incompatible shapes
+        pyfunc = np_nditer2b_err
+        cfunc = jit(debug=True)(pyfunc)
+
+        self.disable_leak_check()
+
+        def check_incompatible(a, b):  # launching must raise SystemError
+            with self.assertRaises(SystemError) as raises:
+                out = np.zeros(
+                    a.size * b.size, dtype=np.result_type(a.dtype, b.dtype)
+                )
+                cfunc[1, 1](a, b, out)
+            self.assertIn(
+                "unknown error",
+                str(raises.exception),
+            )
+
+        check_incompatible(np.arange(2), np.arange(3))  # shapes (2,) vs (3,)
+        a = np.arange(12).reshape((3, 4))
+        b = np.arange(3)  # (3,) does not broadcast with (3, 4)
+        check_incompatible(a, b)
+
+
+if __name__ == "__main__":  # allow running this test module directly
+    unittest.main()
diff --git a/numba_cuda/numba/cuda/typing/context.py b/numba_cuda/numba/cuda/typing/context.py
index 62c6cf938..324d4ab41 100644
--- a/numba_cuda/numba/cuda/typing/context.py
+++ b/numba_cuda/numba/cuda/typing/context.py
@@ -522,7 +522,13 @@ def is_external(obj):
             else:
                 # A type was already inserted, see if we can add to it
                 newty = existing.augment(gty)
-                if newty is None:
+                if (
+                    newty is None
+                    and ".".join(
+                        str(existing.__class__.__module__).split(".")[:2]
+                    )
+                    != "numba.core"
+                ):
                     raise TypeError(
                         "cannot augment %s with %s" % (existing, gty)
                     )
diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py
index 01d99e115..2b8b2374a 100644
--- a/numba_cuda/numba/cuda/ufuncs.py
+++ b/numba_cuda/numba/cuda/ufuncs.py
@@ -13,10 +13,6 @@
 import numpy as np
 from functools import lru_cache
 from numba.cuda import typing
-from numba.cuda.mathimpl import (
-    get_unary_impl_for_fn_and_ty,
-    get_binary_impl_for_fn_and_ty,
-)
 
 
 def get_ufunc_info(ufunc_key):
@@ -29,6 +25,10 @@ def ufunc_db():
     from numba.cuda.cpython import cmathimpl, mathimpl, numbers
     from numba.cuda.np import npyfuncs
     from numba.cuda.np.numpy_support import numpy_version
+    from numba.cuda.mathimpl import (
+        get_unary_impl_for_fn_and_ty,
+        get_binary_impl_for_fn_and_ty,
+    )
 
     def np_unary_impl(fn, context, builder, sig, args):
         npyfuncs._check_arity_and_homogeneity(sig, args, 1)

From 43641c72e9da49da37717499af505453efd08215 Mon Sep 17 00:00:00 2001
From: Atmn Patel <atmnp@nvidia.com>
Date: Mon, 20 Oct 2025 14:44:02 -0700
Subject: [PATCH 2/3] Fix CUDA simulator test failures

---
 numba_cuda/numba/cuda/simulator/memory_management/nrt.py | 8 +++++++-
 numba_cuda/numba/cuda/tests/test_array_attr.py           | 4 ++++
 numba_cuda/numba/cuda/tests/test_array_iterators.py      | 4 ++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/numba_cuda/numba/cuda/simulator/memory_management/nrt.py b/numba_cuda/numba/cuda/simulator/memory_management/nrt.py
index 6097af3b7..c3c974028 100644
--- a/numba_cuda/numba/cuda/simulator/memory_management/nrt.py
+++ b/numba_cuda/numba/cuda/simulator/memory_management/nrt.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 from numba.cuda import config
+from collections import namedtuple
+
+_nrt_mstats = namedtuple("nrt_mstats", ["alloc", "free", "mi_alloc", "mi_free"])
 
 
 class RTSys:
@@ -11,9 +14,12 @@ def __init__(self, *args, **kwargs):
     def memsys_enable_stats(self):
         pass
 
-    def get_allocation_stats(self):
+    def memsys_disable_stats(self):
         pass
 
+    def get_allocation_stats(self):
+        return _nrt_mstats(alloc=0, free=0, mi_alloc=0, mi_free=0)
+
 
 rtsys = RTSys()
 
diff --git a/numba_cuda/numba/cuda/tests/test_array_attr.py b/numba_cuda/numba/cuda/tests/test_array_attr.py
index 8f52a7c4e..4a49b5bb6 100644
--- a/numba_cuda/numba/cuda/tests/test_array_attr.py
+++ b/numba_cuda/numba/cuda/tests/test_array_attr.py
@@ -12,6 +12,10 @@
 from numba.cuda.testing import CUDATestCase
 from numba.core.errors import TypingError
 from numba.cuda.tests.support import override_config
+from numba.cuda import config
+
+if config.ENABLE_CUDASIM:
+    raise unittest.SkipTest("Array attribute tests not done in simulator")
 
 
 def array_itemsize(a, res):
diff --git a/numba_cuda/numba/cuda/tests/test_array_iterators.py b/numba_cuda/numba/cuda/tests/test_array_iterators.py
index 0cd9c8d12..65345d8e1 100644
--- a/numba_cuda/numba/cuda/tests/test_array_iterators.py
+++ b/numba_cuda/numba/cuda/tests/test_array_iterators.py
@@ -11,6 +11,10 @@
 from numba.cuda.testing import CUDATestCase
 import unittest
 import itertools
+from numba.cuda import config
+
+if config.ENABLE_CUDASIM:
+    raise unittest.SkipTest("Array iterator tests not done in simulator")
 
 
 def array_iter(arr, out):

From 675fd400a04068a6efdf76af5bd58bf244f3eecf Mon Sep 17 00:00:00 2001
From: Atmn Patel <atmnp@nvidia.com>
Date: Mon, 3 Nov 2025 12:54:40 -0800
Subject: [PATCH 3/3] Pass messages to skip_on_cudasim and import errors
 from the vendored numba.cuda.core.errors

---
 numba_cuda/numba/cuda/tests/test_array_attr.py      | 2 +-
 numba_cuda/numba/cuda/tests/test_array_constants.py | 4 ++--
 numba_cuda/numba/cuda/tests/test_array_iterators.py | 2 +-
 numba_cuda/numba/cuda/typing/context.py             | 8 +-------
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/numba_cuda/numba/cuda/tests/test_array_attr.py b/numba_cuda/numba/cuda/tests/test_array_attr.py
index 4a49b5bb6..09f461674 100644
--- a/numba_cuda/numba/cuda/tests/test_array_attr.py
+++ b/numba_cuda/numba/cuda/tests/test_array_attr.py
@@ -10,7 +10,7 @@
 from numba.core import types
 from numba.cuda.tests.support import MemoryLeakMixin
 from numba.cuda.testing import CUDATestCase
-from numba.core.errors import TypingError
+from numba.cuda.core.errors import TypingError
 from numba.cuda.tests.support import override_config
 from numba.cuda import config
 
diff --git a/numba_cuda/numba/cuda/tests/test_array_constants.py b/numba_cuda/numba/cuda/tests/test_array_constants.py
index e5a927e52..80c46a073 100644
--- a/numba_cuda/numba/cuda/tests/test_array_constants.py
+++ b/numba_cuda/numba/cuda/tests/test_array_constants.py
@@ -118,7 +118,7 @@ def bytes_as_const_array_kernel(output_array, size):
             output_array[i] = 111  # 'o'
 
 
-@skip_on_cudasim
+@skip_on_cudasim("CUDA simulator does not support array constants")
 class TestConstantArray(CUDATestCase):
     """
     Test array constants.
@@ -260,7 +260,7 @@ def big_array_kernel(input_array, output_array, size):
         np.testing.assert_array_equal(f_array.flatten(), result)
 
 
-@skip_on_cudasim
+@skip_on_cudasim("CUDA simulator does not support array constants")
 class TestConstantBytes(CUDATestCase):
     def test_constant_bytes(self):
         # Test constant bytes array in CUDA kernel
diff --git a/numba_cuda/numba/cuda/tests/test_array_iterators.py b/numba_cuda/numba/cuda/tests/test_array_iterators.py
index 65345d8e1..6653a9f5c 100644
--- a/numba_cuda/numba/cuda/tests/test_array_iterators.py
+++ b/numba_cuda/numba/cuda/tests/test_array_iterators.py
@@ -617,7 +617,7 @@ def test_nditer3(self):
     def test_errors(self):
         # Incompatible shapes
         pyfunc = np_nditer2b_err
-        cfunc = jit(debug=True)(pyfunc)
+        cfunc = jit(debug=True, opt=False)(pyfunc)
 
         self.disable_leak_check()
 
diff --git a/numba_cuda/numba/cuda/typing/context.py b/numba_cuda/numba/cuda/typing/context.py
index 81c5caef7..6c2c933bf 100644
--- a/numba_cuda/numba/cuda/typing/context.py
+++ b/numba_cuda/numba/cuda/typing/context.py
@@ -491,13 +491,7 @@ def is_external(obj):
             else:
                 # A type was already inserted, see if we can add to it
                 newty = existing.augment(gty)
-                if (
-                    newty is None
-                    and ".".join(
-                        str(existing.__class__.__module__).split(".")[:2]
-                    )
-                    != "numba.core"
-                ):
+                if newty is None:
                     raise TypeError(
                         "cannot augment %s with %s" % (existing, gty)
                     )