diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py
index 629f32362..07c5ae38c 100644
--- a/numba_cuda/numba/cuda/cudadrv/devicearray.py
+++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -21,9 +21,9 @@ from numba.cuda.cudadrv import driver as _driver
 from numba.core import types
 from numba.cuda.core import config
-from numba.np.unsafe.ndarray import to_fixed_tuple
-from numba.np.numpy_support import numpy_version
-from numba.np import numpy_support
+from numba.cuda.np.unsafe.ndarray import to_fixed_tuple
+from numba.cuda.np.numpy_support import numpy_version
+from numba.cuda.np import numpy_support
 from numba.cuda.api_util import prepare_shape_strides_dtype
 from numba.core.errors import NumbaPerformanceWarning
 from warnings import warn
diff --git a/numba_cuda/numba/cuda/cudaimpl.py b/numba_cuda/numba/cuda/cudaimpl.py
index 3a326286c..c78e6c74a 100644
--- a/numba_cuda/numba/cuda/cudaimpl.py
+++ b/numba_cuda/numba/cuda/cudaimpl.py
@@ -14,8 +14,8 @@ from numba.core.datamodel import models
 from numba.core import types
 from numba.cuda import cgutils
-from numba.np import ufunc_db
-from numba.np.npyimpl import register_ufuncs
+from numba.cuda.np import ufunc_db
+from numba.cuda.np.npyimpl import register_ufuncs
 from .cudadrv import nvvm
 from numba import cuda
 from numba.cuda import nvvmutils, stubs
diff --git a/numba_cuda/numba/cuda/kernels/reduction.py b/numba_cuda/numba/cuda/kernels/reduction.py
index 129f525bf..463db8846 100644
--- a/numba_cuda/numba/cuda/kernels/reduction.py
+++ b/numba_cuda/numba/cuda/kernels/reduction.py
@@ -5,7 +5,7 @@ A library written in CUDA Python for generating reduction kernels
 """
-from numba.np.numpy_support import from_dtype
+from numba.cuda.np.numpy_support import from_dtype
 _WARPSIZE = 32
diff --git a/numba_cuda/numba/cuda/kernels/transpose.py b/numba_cuda/numba/cuda/kernels/transpose.py
index fd031d21d..01e2670b0 100644
--- a/numba_cuda/numba/cuda/kernels/transpose.py
+++ b/numba_cuda/numba/cuda/kernels/transpose.py
@@ -4,7 +4,7 @@ from numba import cuda
 from numba.cuda.cudadrv.driver import driver
 import math
-from numba.np import numpy_support as nps
+from numba.cuda.np import numpy_support as nps
 def transpose(a, b=None):
diff --git a/numba_cuda/numba/cuda/np/arrayobj.py b/numba_cuda/numba/cuda/np/arrayobj.py
new file mode 100644
index 000000000..fd7d1c7a3
--- /dev/null
+++ b/numba_cuda/numba/cuda/np/arrayobj.py
@@ -0,0 +1,7679 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Implementation of operations on Array objects and objects supporting
+the buffer protocol.
+""" + +import functools +import math +import operator +import textwrap + +from llvmlite import ir +from llvmlite.ir import Constant + +import numpy as np + +from numba import pndindex, literal_unroll +from numba.core import types, typing, errors, cgutils, extending +from numba.cuda.np.numpy_support import ( + as_dtype, + from_dtype, + carray, + farray, + is_contiguous, + is_fortran, + check_is_integer, + type_is_scalar, + lt_complex, + lt_floats, +) +from numba.cuda.np.numpy_support import type_can_asarray, numpy_version +from numba.cuda.cgutils import is_nonelike +from numba.core.imputils import ( + lower_builtin, + lower_getattr, + lower_getattr_generic, + lower_setattr_generic, + lower_cast, + lower_constant, + iternext_impl, + impl_ret_borrowed, + impl_ret_new_ref, + impl_ret_untracked, + RefType, +) +from numba.cuda.typing import signature +from numba.core.types import StringLiteral +from numba.core.extending import ( + register_jitable, + overload, + overload_method, + intrinsic, + overload_attribute, +) +from numba.misc import quicksort, mergesort +from numba.cpython import slicing +from numba.cpython.unsafe.tuple import tuple_setitem, build_full_slice_tuple +from numba.core.extending import overload_classmethod +from numba.core.typing.npydecl import ( + parse_dtype as ty_parse_dtype, + parse_shape as ty_parse_shape, + _parse_nested_sequence, + _sequence_of_arrays, + _choose_concatenation_layout, +) + + +def set_range_metadata(builder, load, lower_bound, upper_bound): + """ + Set the "range" metadata on a load instruction. + Note the interval is in the form [lower_bound, upper_bound). + """ + range_operands = [ + Constant(load.type, lower_bound), + Constant(load.type, upper_bound), + ] + md = builder.module.add_metadata(range_operands) + load.set_metadata("range", md) + + +def mark_positive(builder, load): + """ + Mark the result of a load instruction as positive (or zero). + """ + upper_bound = (1 << (load.type.width - 1)) - 1 + set_range_metadata(builder, load, 0, upper_bound) + + +def make_array(array_type): + """ + Return the Structure representation of the given *array_type* + (an instance of types.ArrayCompatible). + + Note this does not call __array_wrap__ in case a new array structure + is being created (rather than populated). + """ + real_array_type = array_type.as_array + base = cgutils.create_struct_proxy(real_array_type) + ndim = real_array_type.ndim + + class ArrayStruct(base): + def _make_refs(self, ref): + sig = signature(real_array_type, array_type) + try: + array_impl = self._context.get_function("__array__", sig) + except NotImplementedError: + return super(ArrayStruct, self)._make_refs(ref) + + # Return a wrapped structure and its unwrapped reference + datamodel = self._context.data_model_manager[array_type] + be_type = self._get_be_type(datamodel) + if ref is None: + outer_ref = cgutils.alloca_once( + self._builder, be_type, zfill=True + ) + else: + outer_ref = ref + # NOTE: __array__ is called with a pointer and expects a pointer + # in return! + ref = array_impl(self._builder, (outer_ref,)) + return outer_ref, ref + + @property + def shape(self): + """ + Override .shape to inform LLVM that its elements are all positive. + """ + builder = self._builder + if ndim == 0: + return base.__getattr__(self, "shape") + + # Unfortunately, we can't use llvm.assume as its presence can + # seriously pessimize performance, + # *and* the range metadata currently isn't improving anything here, + # see https://llvm.org/bugs/show_bug.cgi?id=23848 ! 
+ ptr = self._get_ptr_by_name("shape") + dims = [] + for i in range(ndim): + dimptr = cgutils.gep_inbounds(builder, ptr, 0, i) + load = builder.load(dimptr) + dims.append(load) + mark_positive(builder, load) + + return cgutils.pack_array(builder, dims) + + return ArrayStruct + + +def get_itemsize(context, array_type): + """ + Return the item size for the given array or buffer type. + """ + llty = context.get_data_type(array_type.dtype) + return context.get_abi_sizeof(llty) + + +def load_item(context, builder, arrayty, ptr): + """ + Load the item at the given array pointer. + """ + align = None if arrayty.aligned else 1 + return context.unpack_value(builder, arrayty.dtype, ptr, align=align) + + +def store_item(context, builder, arrayty, val, ptr): + """ + Store the item at the given array pointer. + """ + align = None if arrayty.aligned else 1 + return context.pack_value(builder, arrayty.dtype, val, ptr, align=align) + + +def fix_integer_index(context, builder, idxty, idx, size): + """ + Fix the integer index' type and value for the given dimension size. + """ + if idxty.signed: + ind = context.cast(builder, idx, idxty, types.intp) + ind = slicing.fix_index(builder, ind, size) + else: + ind = context.cast(builder, idx, idxty, types.uintp) + return ind + + +def normalize_index(context, builder, idxty, idx): + """ + Normalize the index type and value. 0-d arrays are converted to scalars. + """ + if isinstance(idxty, types.Array) and idxty.ndim == 0: + assert isinstance(idxty.dtype, types.Integer) + idxary = make_array(idxty)(context, builder, idx) + idxval = load_item(context, builder, idxty, idxary.data) + return idxty.dtype, idxval + else: + return idxty, idx + + +def normalize_indices(context, builder, index_types, indices): + """ + Same as normalize_index(), but operating on sequences of + index types and values. + """ + if len(indices): + index_types, indices = zip( + *[ + normalize_index(context, builder, idxty, idx) + for idxty, idx in zip(index_types, indices) + ] + ) + return index_types, indices + + +def populate_array(array, data, shape, strides, itemsize, meminfo, parent=None): + """ + Helper function for populating array structures. + This avoids forgetting to set fields. + + *shape* and *strides* can be Python tuples or LLVM arrays. + """ + context = array._context + builder = array._builder + datamodel = array._datamodel + # doesn't matter what this array type instance is, it's just to get the + # fields for the datamodel of the standard array type in this context + standard_array = types.Array(types.float64, 1, "C") + standard_array_type_datamodel = context.data_model_manager[standard_array] + required_fields = set(standard_array_type_datamodel._fields) + datamodel_fields = set(datamodel._fields) + # Make sure that the presented array object has a data model that is close + # enough to an array for this function to proceed. + if (required_fields & datamodel_fields) != required_fields: + missing = required_fields - datamodel_fields + msg = ( + f"The datamodel for type {array._fe_type} is missing " + f"field{'s' if len(missing) > 1 else ''} {missing}." 
+ ) + raise ValueError(msg) + + if meminfo is None: + meminfo = Constant( + context.get_value_type(datamodel.get_type("meminfo")), None + ) + + intp_t = context.get_value_type(types.intp) + if isinstance(shape, (tuple, list)): + shape = cgutils.pack_array(builder, shape, intp_t) + if isinstance(strides, (tuple, list)): + strides = cgutils.pack_array(builder, strides, intp_t) + if isinstance(itemsize, int): + itemsize = intp_t(itemsize) + + attrs = dict( + shape=shape, + strides=strides, + data=data, + itemsize=itemsize, + meminfo=meminfo, + ) + + # Set `parent` attribute + if parent is None: + attrs["parent"] = Constant( + context.get_value_type(datamodel.get_type("parent")), None + ) + else: + attrs["parent"] = parent + # Calc num of items from shape + nitems = context.get_constant(types.intp, 1) + unpacked_shape = cgutils.unpack_tuple(builder, shape, shape.type.count) + # (note empty shape => 0d array therefore nitems = 1) + for axlen in unpacked_shape: + nitems = builder.mul(nitems, axlen, flags=["nsw"]) + attrs["nitems"] = nitems + + # Make sure that we have all the fields + got_fields = set(attrs.keys()) + if got_fields != required_fields: + raise ValueError("missing {0}".format(required_fields - got_fields)) + + # Set field value + for k, v in attrs.items(): + setattr(array, k, v) + + return array + + +def update_array_info(aryty, array): + """ + Update some auxiliary information in *array* after some of its fields + were changed. `itemsize` and `nitems` are updated. + """ + context = array._context + builder = array._builder + + # Calc num of items from shape + nitems = context.get_constant(types.intp, 1) + unpacked_shape = cgutils.unpack_tuple(builder, array.shape, aryty.ndim) + for axlen in unpacked_shape: + nitems = builder.mul(nitems, axlen, flags=["nsw"]) + array.nitems = nitems + + array.itemsize = context.get_constant( + types.intp, get_itemsize(context, aryty) + ) + + +def normalize_axis(func_name, arg_name, ndim, axis): + """Constrain axis values to valid positive values.""" + raise NotImplementedError() + + +@overload(normalize_axis) +def normalize_axis_overloads(func_name, arg_name, ndim, axis): + if not isinstance(func_name, StringLiteral): + raise errors.TypingError("func_name must be a str literal.") + if not isinstance(arg_name, StringLiteral): + raise errors.TypingError("arg_name must be a str literal.") + + msg = ( + f"{func_name.literal_value}: Argument {arg_name.literal_value} " + "out of bounds for dimensions of the array" + ) + + def impl(func_name, arg_name, ndim, axis): + if axis < 0: + axis += ndim + if axis < 0 or axis >= ndim: + raise ValueError(msg) + + return axis + + return impl + + +@lower_builtin("getiter", types.Buffer) +def getiter_array(context, builder, sig, args): + [arrayty] = sig.args + [array] = args + + iterobj = context.make_helper(builder, sig.return_type) + + zero = context.get_constant(types.intp, 0) + indexptr = cgutils.alloca_once_value(builder, zero) + + iterobj.index = indexptr + iterobj.array = array + + # Incref array + if context.enable_nrt: + context.nrt.incref(builder, arrayty, array) + + res = iterobj._getvalue() + + # Note: a decref on the iterator will dereference all internal MemInfo* + out = impl_ret_new_ref(context, builder, sig.return_type, res) + return out + + +def _getitem_array_single_int(context, builder, return_type, aryty, ary, idx): + """Evaluate `ary[idx]`, where idx is a single int.""" + # optimized form of _getitem_array_generic + shapes = cgutils.unpack_tuple(builder, ary.shape, count=aryty.ndim) + strides = 
cgutils.unpack_tuple(builder, ary.strides, count=aryty.ndim) + offset = builder.mul(strides[0], idx) + dataptr = cgutils.pointer_add(builder, ary.data, offset) + view_shapes = shapes[1:] + view_strides = strides[1:] + + if isinstance(return_type, types.Buffer): + # Build array view + retary = make_view( + context, + builder, + aryty, + ary, + return_type, + dataptr, + view_shapes, + view_strides, + ) + return retary._getvalue() + else: + # Load scalar from 0-d result + assert not view_shapes + return load_item(context, builder, aryty, dataptr) + + +@lower_builtin("iternext", types.ArrayIterator) +@iternext_impl(RefType.BORROWED) +def iternext_array(context, builder, sig, args, result): + [iterty] = sig.args + [iter] = args + arrayty = iterty.array_type + + iterobj = context.make_helper(builder, iterty, value=iter) + ary = make_array(arrayty)(context, builder, value=iterobj.array) + + (nitems,) = cgutils.unpack_tuple(builder, ary.shape, count=1) + + index = builder.load(iterobj.index) + is_valid = builder.icmp_signed("<", index, nitems) + result.set_valid(is_valid) + + with builder.if_then(is_valid): + value = _getitem_array_single_int( + context, builder, iterty.yield_type, arrayty, ary, index + ) + result.yield_(value) + nindex = cgutils.increment_index(builder, index) + builder.store(nindex, iterobj.index) + + +# ------------------------------------------------------------------------------ +# Basic indexing (with integers and slices only) + + +def basic_indexing( + context, builder, aryty, ary, index_types, indices, boundscheck=None +): + """ + Perform basic indexing on the given array. + A (data pointer, shapes, strides) tuple is returned describing + the corresponding view. + """ + zero = context.get_constant(types.intp, 0) + one = context.get_constant(types.intp, 1) + + shapes = cgutils.unpack_tuple(builder, ary.shape, aryty.ndim) + strides = cgutils.unpack_tuple(builder, ary.strides, aryty.ndim) + + output_indices = [] + output_shapes = [] + output_strides = [] + + num_newaxes = len([idx for idx in index_types if is_nonelike(idx)]) + ax = 0 + for indexval, idxty in zip(indices, index_types): + if idxty is types.ellipsis: + # Fill up missing dimensions at the middle + n_missing = aryty.ndim - len(indices) + 1 + num_newaxes + for i in range(n_missing): + output_indices.append(zero) + output_shapes.append(shapes[ax]) + output_strides.append(strides[ax]) + ax += 1 + continue + # Regular index value + if isinstance(idxty, types.SliceType): + slice = context.make_helper(builder, idxty, value=indexval) + slicing.guard_invalid_slice(context, builder, idxty, slice) + slicing.fix_slice(builder, slice, shapes[ax]) + output_indices.append(slice.start) + sh = slicing.get_slice_length(builder, slice) + st = slicing.fix_stride(builder, slice, strides[ax]) + output_shapes.append(sh) + output_strides.append(st) + elif isinstance(idxty, types.Integer): + ind = fix_integer_index( + context, builder, idxty, indexval, shapes[ax] + ) + if boundscheck: + cgutils.do_boundscheck(context, builder, ind, shapes[ax], ax) + output_indices.append(ind) + elif is_nonelike(idxty): + output_shapes.append(one) + output_strides.append(zero) + ax -= 1 + else: + raise NotImplementedError("unexpected index type: %s" % (idxty,)) + ax += 1 + + # Fill up missing dimensions at the end + assert ax <= aryty.ndim + while ax < aryty.ndim: + output_shapes.append(shapes[ax]) + output_strides.append(strides[ax]) + ax += 1 + + # No need to check wraparound, as negative indices were already + # fixed in the loop above. 
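+    # output_indices now addresses the first element of the resulting view;
+    # return its data pointer along with the view's shape and strides.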
+ dataptr = cgutils.get_item_pointer( + context, + builder, + aryty, + ary, + output_indices, + wraparound=False, + boundscheck=False, + ) + return (dataptr, output_shapes, output_strides) + + +def make_view(context, builder, aryty, ary, return_type, data, shapes, strides): + """ + Build a view over the given array with the given parameters. + """ + retary = make_array(return_type)(context, builder) + populate_array( + retary, + data=data, + shape=shapes, + strides=strides, + itemsize=ary.itemsize, + meminfo=ary.meminfo, + parent=ary.parent, + ) + return retary + + +def _getitem_array_generic( + context, builder, return_type, aryty, ary, index_types, indices +): + """ + Return the result of indexing *ary* with the given *indices*, + returning either a scalar or a view. + """ + dataptr, view_shapes, view_strides = basic_indexing( + context, + builder, + aryty, + ary, + index_types, + indices, + boundscheck=context.enable_boundscheck, + ) + + if isinstance(return_type, types.Buffer): + # Build array view + retary = make_view( + context, + builder, + aryty, + ary, + return_type, + dataptr, + view_shapes, + view_strides, + ) + return retary._getvalue() + else: + # Load scalar from 0-d result + assert not view_shapes + return load_item(context, builder, aryty, dataptr) + + +@lower_builtin(operator.getitem, types.Buffer, types.Integer) +@lower_builtin(operator.getitem, types.Buffer, types.SliceType) +def getitem_arraynd_intp(context, builder, sig, args): + """ + Basic indexing with an integer or a slice. + """ + aryty, idxty = sig.args + ary, idx = args + + assert aryty.ndim >= 1 + ary = make_array(aryty)(context, builder, ary) + + res = _getitem_array_generic( + context, builder, sig.return_type, aryty, ary, (idxty,), (idx,) + ) + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin(operator.getitem, types.Buffer, types.BaseTuple) +def getitem_array_tuple(context, builder, sig, args): + """ + Basic or advanced indexing with a tuple. + """ + aryty, tupty = sig.args + ary, tup = args + ary = make_array(aryty)(context, builder, ary) + + index_types = tupty.types + indices = cgutils.unpack_tuple(builder, tup, count=len(tupty)) + + index_types, indices = normalize_indices( + context, builder, index_types, indices + ) + + if any(isinstance(ty, types.Array) for ty in index_types): + # Advanced indexing + return fancy_getitem( + context, builder, sig, args, aryty, ary, index_types, indices + ) + + res = _getitem_array_generic( + context, builder, sig.return_type, aryty, ary, index_types, indices + ) + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin(operator.setitem, types.Buffer, types.Any, types.Any) +def setitem_array(context, builder, sig, args): + """ + array[a] = scalar_or_array + array[a,..,b] = scalar_or_array + """ + aryty, idxty, valty = sig.args + ary, idx, val = args + + if isinstance(idxty, types.BaseTuple): + index_types = idxty.types + indices = cgutils.unpack_tuple(builder, idx, count=len(idxty)) + else: + index_types = (idxty,) + indices = (idx,) + + ary = make_array(aryty)(context, builder, ary) + + # First try basic indexing to see if a single array location is denoted. 
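+    # (0-d integer array indices are normalized to plain scalars first.)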
+ index_types, indices = normalize_indices( + context, builder, index_types, indices + ) + try: + dataptr, shapes, strides = basic_indexing( + context, + builder, + aryty, + ary, + index_types, + indices, + boundscheck=context.enable_boundscheck, + ) + except NotImplementedError: + use_fancy_indexing = True + else: + use_fancy_indexing = bool(shapes) + + if use_fancy_indexing: + # Index describes a non-trivial view => use generic slice assignment + # (NOTE: this also handles scalar broadcasting) + return fancy_setslice(context, builder, sig, args, index_types, indices) + + # Store source value the given location + val = context.cast(builder, val, valty, aryty.dtype) + store_item(context, builder, aryty, val, dataptr) + + +@lower_builtin(len, types.Buffer) +def array_len(context, builder, sig, args): + (aryty,) = sig.args + (ary,) = args + arystty = make_array(aryty) + ary = arystty(context, builder, ary) + shapeary = ary.shape + res = builder.extract_value(shapeary, 0) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +@lower_builtin("array.item", types.Array) +def array_item(context, builder, sig, args): + (aryty,) = sig.args + (ary,) = args + ary = make_array(aryty)(context, builder, ary) + + nitems = ary.nitems + with builder.if_then( + builder.icmp_signed("!=", nitems, nitems.type(1)), likely=False + ): + msg = "item(): can only convert an array of size 1 to a Python scalar" + context.call_conv.return_user_exc(builder, ValueError, (msg,)) + + return load_item(context, builder, aryty, ary.data) + + +if numpy_version < (2, 0): + + @lower_builtin("array.itemset", types.Array, types.Any) + def array_itemset(context, builder, sig, args): + aryty, valty = sig.args + ary, val = args + assert valty == aryty.dtype + ary = make_array(aryty)(context, builder, ary) + + nitems = ary.nitems + with builder.if_then( + builder.icmp_signed("!=", nitems, nitems.type(1)), likely=False + ): + msg = "itemset(): can only write to an array of size 1" + context.call_conv.return_user_exc(builder, ValueError, (msg,)) + + store_item(context, builder, aryty, val, ary.data) + return context.get_dummy_value() + + +# ------------------------------------------------------------------------------ +# Advanced / fancy indexing + + +class Indexer(object): + """ + Generic indexer interface, for generating indices over a fancy indexed + array on a single dimension. + """ + + def prepare(self): + """ + Prepare the indexer by initializing any required variables, basic + blocks... + """ + raise NotImplementedError + + def get_size(self): + """ + Return this dimension's size as an integer. + """ + raise NotImplementedError + + def get_shape(self): + """ + Return this dimension's shape as a tuple. + """ + raise NotImplementedError + + def get_index_bounds(self): + """ + Return a half-open [lower, upper) range of indices this dimension + is guaranteed not to step out of. + """ + raise NotImplementedError + + def loop_head(self): + """ + Start indexation loop. Return a (index, count) tuple. + *index* is an integer LLVM value representing the index over this + dimension. + *count* is either an integer LLVM value representing the current + iteration count, or None if this dimension should be omitted from + the indexation result. + """ + raise NotImplementedError + + def loop_tail(self): + """ + Finish indexation loop. + """ + raise NotImplementedError + + +class EntireIndexer(Indexer): + """ + Compute indices along an entire array dimension. 
+ """ + + def __init__(self, context, builder, aryty, ary, dim): + self.context = context + self.builder = builder + self.aryty = aryty + self.ary = ary + self.dim = dim + self.ll_intp = self.context.get_value_type(types.intp) + + def prepare(self): + builder = self.builder + self.size = builder.extract_value(self.ary.shape, self.dim) + self.index = cgutils.alloca_once(builder, self.ll_intp) + self.bb_start = builder.append_basic_block() + self.bb_end = builder.append_basic_block() + + def get_size(self): + return self.size + + def get_shape(self): + return (self.size,) + + def get_index_bounds(self): + # [0, size) + return (self.ll_intp(0), self.size) + + def loop_head(self): + builder = self.builder + # Initialize loop variable + self.builder.store(Constant(self.ll_intp, 0), self.index) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_start) + cur_index = builder.load(self.index) + with builder.if_then( + builder.icmp_signed(">=", cur_index, self.size), likely=False + ): + builder.branch(self.bb_end) + return cur_index, cur_index + + def loop_tail(self): + builder = self.builder + next_index = cgutils.increment_index(builder, builder.load(self.index)) + builder.store(next_index, self.index) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_end) + + +class IntegerIndexer(Indexer): + """ + Compute indices from a single integer. + """ + + def __init__(self, context, builder, idx): + self.context = context + self.builder = builder + self.idx = idx + self.ll_intp = self.context.get_value_type(types.intp) + + def prepare(self): + pass + + def get_size(self): + return Constant(self.ll_intp, 1) + + def get_shape(self): + return () + + def get_index_bounds(self): + # [idx, idx+1) + return (self.idx, self.builder.add(self.idx, self.get_size())) + + def loop_head(self): + return self.idx, None + + def loop_tail(self): + pass + + +class IntegerArrayIndexer(Indexer): + """ + Compute indices from an array of integer indices. 
+ """ + + def __init__(self, context, builder, idxty, idxary, size): + self.context = context + self.builder = builder + self.idxty = idxty + self.idxary = idxary + self.size = size + assert idxty.ndim == 1 + self.ll_intp = self.context.get_value_type(types.intp) + + def prepare(self): + builder = self.builder + self.idx_size = cgutils.unpack_tuple(builder, self.idxary.shape)[0] + self.idx_index = cgutils.alloca_once(builder, self.ll_intp) + self.bb_start = builder.append_basic_block() + self.bb_end = builder.append_basic_block() + + def get_size(self): + return self.idx_size + + def get_shape(self): + return (self.idx_size,) + + def get_index_bounds(self): + # Pessimal heuristic, as we don't want to scan for the min and max + return (self.ll_intp(0), self.size) + + def loop_head(self): + builder = self.builder + # Initialize loop variable + self.builder.store(Constant(self.ll_intp, 0), self.idx_index) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_start) + cur_index = builder.load(self.idx_index) + with builder.if_then( + builder.icmp_signed(">=", cur_index, self.idx_size), likely=False + ): + builder.branch(self.bb_end) + # Load the actual index from the array of indices + index = _getitem_array_single_int( + self.context, + builder, + self.idxty.dtype, + self.idxty, + self.idxary, + cur_index, + ) + index = fix_integer_index( + self.context, builder, self.idxty.dtype, index, self.size + ) + return index, cur_index + + def loop_tail(self): + builder = self.builder + next_index = cgutils.increment_index( + builder, builder.load(self.idx_index) + ) + builder.store(next_index, self.idx_index) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_end) + + +class BooleanArrayIndexer(Indexer): + """ + Compute indices from an array of boolean predicates. 
+ """ + + def __init__(self, context, builder, idxty, idxary): + self.context = context + self.builder = builder + self.idxty = idxty + self.idxary = idxary + assert idxty.ndim == 1 + self.ll_intp = self.context.get_value_type(types.intp) + self.zero = Constant(self.ll_intp, 0) + + def prepare(self): + builder = self.builder + self.size = cgutils.unpack_tuple(builder, self.idxary.shape)[0] + self.idx_index = cgutils.alloca_once(builder, self.ll_intp) + self.count = cgutils.alloca_once(builder, self.ll_intp) + self.bb_start = builder.append_basic_block() + self.bb_tail = builder.append_basic_block() + self.bb_end = builder.append_basic_block() + + def get_size(self): + builder = self.builder + count = cgutils.alloca_once_value(builder, self.zero) + # Sum all true values + with cgutils.for_range(builder, self.size) as loop: + c = builder.load(count) + pred = _getitem_array_single_int( + self.context, + builder, + self.idxty.dtype, + self.idxty, + self.idxary, + loop.index, + ) + c = builder.add(c, builder.zext(pred, c.type)) + builder.store(c, count) + + return builder.load(count) + + def get_shape(self): + return (self.get_size(),) + + def get_index_bounds(self): + # Pessimal heuristic, as we don't want to scan for the + # first and last true items + return (self.ll_intp(0), self.size) + + def loop_head(self): + builder = self.builder + # Initialize loop variable + self.builder.store(self.zero, self.idx_index) + self.builder.store(self.zero, self.count) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_start) + cur_index = builder.load(self.idx_index) + cur_count = builder.load(self.count) + with builder.if_then( + builder.icmp_signed(">=", cur_index, self.size), likely=False + ): + builder.branch(self.bb_end) + # Load the predicate and branch if false + pred = _getitem_array_single_int( + self.context, + builder, + self.idxty.dtype, + self.idxty, + self.idxary, + cur_index, + ) + with builder.if_then(builder.not_(pred)): + builder.branch(self.bb_tail) + # Increment the count for next iteration + next_count = cgutils.increment_index(builder, cur_count) + builder.store(next_count, self.count) + return cur_index, cur_count + + def loop_tail(self): + builder = self.builder + builder.branch(self.bb_tail) + builder.position_at_end(self.bb_tail) + next_index = cgutils.increment_index( + builder, builder.load(self.idx_index) + ) + builder.store(next_index, self.idx_index) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_end) + + +class SliceIndexer(Indexer): + """ + Compute indices along a slice. 
+ """ + + def __init__(self, context, builder, aryty, ary, dim, idxty, slice): + self.context = context + self.builder = builder + self.aryty = aryty + self.ary = ary + self.dim = dim + self.idxty = idxty + self.slice = slice + self.ll_intp = self.context.get_value_type(types.intp) + self.zero = Constant(self.ll_intp, 0) + + def prepare(self): + builder = self.builder + # Fix slice for the dimension's size + self.dim_size = builder.extract_value(self.ary.shape, self.dim) + slicing.guard_invalid_slice( + self.context, builder, self.idxty, self.slice + ) + slicing.fix_slice(builder, self.slice, self.dim_size) + self.is_step_negative = cgutils.is_neg_int(builder, self.slice.step) + # Create loop entities + self.index = cgutils.alloca_once(builder, self.ll_intp) + self.count = cgutils.alloca_once(builder, self.ll_intp) + self.bb_start = builder.append_basic_block() + self.bb_end = builder.append_basic_block() + + def get_size(self): + return slicing.get_slice_length(self.builder, self.slice) + + def get_shape(self): + return (self.get_size(),) + + def get_index_bounds(self): + lower, upper = slicing.get_slice_bounds(self.builder, self.slice) + return lower, upper + + def loop_head(self): + builder = self.builder + # Initialize loop variable + self.builder.store(self.slice.start, self.index) + self.builder.store(self.zero, self.count) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_start) + cur_index = builder.load(self.index) + cur_count = builder.load(self.count) + is_finished = builder.select( + self.is_step_negative, + builder.icmp_signed("<=", cur_index, self.slice.stop), + builder.icmp_signed(">=", cur_index, self.slice.stop), + ) + with builder.if_then(is_finished, likely=False): + builder.branch(self.bb_end) + return cur_index, cur_count + + def loop_tail(self): + builder = self.builder + next_index = builder.add( + builder.load(self.index), self.slice.step, flags=["nsw"] + ) + builder.store(next_index, self.index) + next_count = cgutils.increment_index(builder, builder.load(self.count)) + builder.store(next_count, self.count) + builder.branch(self.bb_start) + builder.position_at_end(self.bb_end) + + +class FancyIndexer(object): + """ + Perform fancy indexing on the given array. 
+ """ + + def __init__(self, context, builder, aryty, ary, index_types, indices): + self.context = context + self.builder = builder + self.aryty = aryty + self.shapes = cgutils.unpack_tuple(builder, ary.shape, aryty.ndim) + self.strides = cgutils.unpack_tuple(builder, ary.strides, aryty.ndim) + self.ll_intp = self.context.get_value_type(types.intp) + self.newaxes = [] + + indexers = [] + num_newaxes = len([idx for idx in index_types if is_nonelike(idx)]) + + ax = 0 # keeps track of position of original axes + new_ax = 0 # keeps track of position for inserting new axes + for indexval, idxty in zip(indices, index_types): + if idxty is types.ellipsis: + # Fill up missing dimensions at the middle + n_missing = aryty.ndim - len(indices) + 1 + num_newaxes + for i in range(n_missing): + indexer = EntireIndexer(context, builder, aryty, ary, ax) + indexers.append(indexer) + ax += 1 + new_ax += 1 + continue + + # Regular index value + if isinstance(idxty, types.SliceType): + slice = context.make_helper(builder, idxty, indexval) + indexer = SliceIndexer( + context, builder, aryty, ary, ax, idxty, slice + ) + indexers.append(indexer) + elif isinstance(idxty, types.Integer): + ind = fix_integer_index( + context, builder, idxty, indexval, self.shapes[ax] + ) + indexer = IntegerIndexer(context, builder, ind) + indexers.append(indexer) + elif isinstance(idxty, types.Array): + idxary = make_array(idxty)(context, builder, indexval) + if isinstance(idxty.dtype, types.Integer): + indexer = IntegerArrayIndexer( + context, builder, idxty, idxary, self.shapes[ax] + ) + elif isinstance(idxty.dtype, types.Boolean): + indexer = BooleanArrayIndexer( + context, builder, idxty, idxary + ) + else: + assert 0 + indexers.append(indexer) + elif is_nonelike(idxty): + self.newaxes.append(new_ax) + ax -= 1 + else: + raise AssertionError("unexpected index type: %s" % (idxty,)) + ax += 1 + new_ax += 1 + + # Fill up missing dimensions at the end + assert ax <= aryty.ndim, (ax, aryty.ndim) + while ax < aryty.ndim: + indexer = EntireIndexer(context, builder, aryty, ary, ax) + indexers.append(indexer) + ax += 1 + + assert len(indexers) == aryty.ndim, (len(indexers), aryty.ndim) + self.indexers = indexers + + def prepare(self): + for i in self.indexers: + i.prepare() + + one = self.context.get_constant(types.intp, 1) + + # Compute the resulting shape given by the indices + res_shape = [i.get_shape() for i in self.indexers] + + # At every position where newaxis/None is present insert + # one as a constant shape in the resulting list of shapes. + for i in self.newaxes: + res_shape.insert(i, (one,)) + + # Store the shape as a tuple, we can't do a simple + # tuple(res_shape) here since res_shape is a list + # of tuples which may be differently sized. + self.indexers_shape = sum(res_shape, ()) + + def get_shape(self): + """ + Get the resulting data shape as Python tuple. + """ + return self.indexers_shape + + def get_offset_bounds(self, strides, itemsize): + """ + Get a half-open [lower, upper) range of byte offsets spanned by + the indexer with the given strides and itemsize. The indexer is + guaranteed to not go past those bounds. 
+ """ + assert len(strides) == self.aryty.ndim + builder = self.builder + is_empty = cgutils.false_bit + zero = self.ll_intp(0) + one = self.ll_intp(1) + lower = zero + upper = zero + for indexer, shape, stride in zip( + self.indexers, self.indexers_shape, strides + ): + is_empty = builder.or_( + is_empty, builder.icmp_unsigned("==", shape, zero) + ) + # Compute [lower, upper) indices on this dimension + lower_index, upper_index = indexer.get_index_bounds() + lower_offset = builder.mul(stride, lower_index) + upper_offset = builder.mul(stride, builder.sub(upper_index, one)) + # Adjust total interval + is_downwards = builder.icmp_signed("<", stride, zero) + lower = builder.add( + lower, builder.select(is_downwards, upper_offset, lower_offset) + ) + upper = builder.add( + upper, builder.select(is_downwards, lower_offset, upper_offset) + ) + # Make interval half-open + upper = builder.add(upper, itemsize) + # Adjust for empty shape + lower = builder.select(is_empty, zero, lower) + upper = builder.select(is_empty, zero, upper) + return lower, upper + + def begin_loops(self): + indices, counts = zip(*(i.loop_head() for i in self.indexers)) + return indices, counts + + def end_loops(self): + for i in reversed(self.indexers): + i.loop_tail() + + +def fancy_getitem( + context, builder, sig, args, aryty, ary, index_types, indices +): + shapes = cgutils.unpack_tuple(builder, ary.shape) + strides = cgutils.unpack_tuple(builder, ary.strides) + data = ary.data + + indexer = FancyIndexer(context, builder, aryty, ary, index_types, indices) + indexer.prepare() + + # Construct output array + out_ty = sig.return_type + out_shapes = indexer.get_shape() + + out = _empty_nd_impl(context, builder, out_ty, out_shapes) + out_data = out.data + out_idx = cgutils.alloca_once_value( + builder, context.get_constant(types.intp, 0) + ) + + # Loop on source and copy to destination + indices, _ = indexer.begin_loops() + + # No need to check for wraparound, as the indexers all ensure + # a positive index is returned. + ptr = cgutils.get_item_pointer2( + context, + builder, + data, + shapes, + strides, + aryty.layout, + indices, + wraparound=False, + boundscheck=context.enable_boundscheck, + ) + val = load_item(context, builder, aryty, ptr) + + # Since the destination is C-contiguous, no need for multi-dimensional + # indexing. + cur = builder.load(out_idx) + ptr = builder.gep(out_data, [cur]) + store_item(context, builder, out_ty, val, ptr) + next_idx = cgutils.increment_index(builder, cur) + builder.store(next_idx, out_idx) + + indexer.end_loops() + + return impl_ret_new_ref(context, builder, out_ty, out._getvalue()) + + +@lower_builtin(operator.getitem, types.Buffer, types.Array) +def fancy_getitem_array(context, builder, sig, args): + """ + Advanced or basic indexing with an array. + """ + aryty, idxty = sig.args + ary, idx = args + ary = make_array(aryty)(context, builder, ary) + if idxty.ndim == 0: + # 0-d array index acts as a basic integer index + idxty, idx = normalize_index(context, builder, idxty, idx) + res = _getitem_array_generic( + context, builder, sig.return_type, aryty, ary, (idxty,), (idx,) + ) + return impl_ret_borrowed(context, builder, sig.return_type, res) + else: + # Advanced indexing + return fancy_getitem( + context, builder, sig, args, aryty, ary, (idxty,), (idx,) + ) + + +def offset_bounds_from_strides(context, builder, arrty, arr, shapes, strides): + """ + Compute a half-open range [lower, upper) of byte offsets from the + array's data pointer, that bound the in-memory extent of the array. 
+ + This mimics offset_bounds_from_strides() from + numpy/core/src/private/mem_overlap.c + """ + itemsize = arr.itemsize + zero = itemsize.type(0) + one = zero.type(1) + if arrty.layout in "CF": + # Array is contiguous: contents are laid out sequentially + # starting from arr.data and upwards + lower = zero + upper = builder.mul(itemsize, arr.nitems) + else: + # Non-contiguous array: need to examine strides + lower = zero + upper = zero + for i in range(arrty.ndim): + # Compute the largest byte offset on this dimension + # max_axis_offset = strides[i] * (shapes[i] - 1) + # (shapes[i] == 0 is catered for by the empty array case below) + max_axis_offset = builder.mul( + strides[i], builder.sub(shapes[i], one) + ) + is_upwards = builder.icmp_signed(">=", max_axis_offset, zero) + # Expand either upwards or downwards depending on stride + upper = builder.select( + is_upwards, builder.add(upper, max_axis_offset), upper + ) + lower = builder.select( + is_upwards, lower, builder.add(lower, max_axis_offset) + ) + # Return a half-open range + upper = builder.add(upper, itemsize) + # Adjust for empty arrays + is_empty = builder.icmp_signed("==", arr.nitems, zero) + upper = builder.select(is_empty, zero, upper) + lower = builder.select(is_empty, zero, lower) + + return lower, upper + + +def compute_memory_extents(context, builder, lower, upper, data): + """ + Given [lower, upper) byte offsets and a base data pointer, + compute the memory pointer bounds as pointer-sized integers. + """ + data_ptr_as_int = builder.ptrtoint(data, lower.type) + start = builder.add(data_ptr_as_int, lower) + end = builder.add(data_ptr_as_int, upper) + return start, end + + +def get_array_memory_extents( + context, builder, arrty, arr, shapes, strides, data +): + """ + Compute a half-open range [start, end) of pointer-sized integers + which fully contain the array data. + """ + lower, upper = offset_bounds_from_strides( + context, builder, arrty, arr, shapes, strides + ) + return compute_memory_extents(context, builder, lower, upper, data) + + +def extents_may_overlap(context, builder, a_start, a_end, b_start, b_end): + """ + Whether two memory extents [a_start, a_end) and [b_start, b_end) + may overlap. + """ + # Comparisons are unsigned, since we are really comparing pointers + may_overlap = builder.and_( + builder.icmp_unsigned("<", a_start, b_end), + builder.icmp_unsigned("<", b_start, a_end), + ) + return may_overlap + + +def maybe_copy_source( + context, builder, use_copy, srcty, src, src_shapes, src_strides, src_data +): + ptrty = src_data.type + + copy_layout = "C" + copy_data = cgutils.alloca_once_value(builder, src_data) + copy_shapes = src_shapes + copy_strides = None # unneeded for contiguous arrays + + with builder.if_then(use_copy, likely=False): + # Allocate temporary scratchpad + # XXX: should we use a stack-allocated array for very small + # data sizes? 
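+        # The scratchpad covers the whole source (itemsize * nitems bytes)
+        # and the source is copied into it in C order.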
+ allocsize = builder.mul(src.itemsize, src.nitems) + data = context.nrt.allocate(builder, allocsize) + voidptrty = data.type + data = builder.bitcast(data, ptrty) + builder.store(data, copy_data) + + # Copy source data into scratchpad + intp_t = context.get_value_type(types.intp) + + with cgutils.loop_nest(builder, src_shapes, intp_t) as indices: + src_ptr = cgutils.get_item_pointer2( + context, + builder, + src_data, + src_shapes, + src_strides, + srcty.layout, + indices, + ) + dest_ptr = cgutils.get_item_pointer2( + context, + builder, + data, + copy_shapes, + copy_strides, + copy_layout, + indices, + ) + builder.store(builder.load(src_ptr), dest_ptr) + + def src_getitem(source_indices): + src_ptr = cgutils.alloca_once(builder, ptrty) + with builder.if_else(use_copy, likely=False) as (if_copy, otherwise): + with if_copy: + builder.store( + cgutils.get_item_pointer2( + context, + builder, + builder.load(copy_data), + copy_shapes, + copy_strides, + copy_layout, + source_indices, + wraparound=False, + ), + src_ptr, + ) + with otherwise: + builder.store( + cgutils.get_item_pointer2( + context, + builder, + src_data, + src_shapes, + src_strides, + srcty.layout, + source_indices, + wraparound=False, + ), + src_ptr, + ) + return load_item(context, builder, srcty, builder.load(src_ptr)) + + def src_cleanup(): + # Deallocate memory + with builder.if_then(use_copy, likely=False): + data = builder.load(copy_data) + data = builder.bitcast(data, voidptrty) + context.nrt.free(builder, data) + + return src_getitem, src_cleanup + + +def _bc_adjust_dimension(context, builder, shapes, strides, target_shape): + """ + Preprocess dimension for broadcasting. + Returns (shapes, strides) such that the ndim match *target_shape*. + When expanding to higher ndim, the returning shapes and strides are + prepended with ones and zeros, respectively. + When truncating to lower ndim, the shapes are checked (in runtime). + All extra dimension must have size of 1. + """ + zero = context.get_constant(types.uintp, 0) + one = context.get_constant(types.uintp, 1) + + # Adjust for broadcasting to higher dimension + if len(target_shape) > len(shapes): + nd_diff = len(target_shape) - len(shapes) + # Fill missing shapes with one, strides with zeros + shapes = [one] * nd_diff + shapes + strides = [zero] * nd_diff + strides + # Adjust for broadcasting to lower dimension + elif len(target_shape) < len(shapes): + # Accepted if all extra dims has shape 1 + nd_diff = len(shapes) - len(target_shape) + dim_is_one = [ + builder.icmp_unsigned("==", sh, one) for sh in shapes[:nd_diff] + ] + accepted = functools.reduce(builder.and_, dim_is_one, cgutils.true_bit) + # Check error + with builder.if_then(builder.not_(accepted), likely=False): + msg = "cannot broadcast source array for assignment" + context.call_conv.return_user_exc(builder, ValueError, (msg,)) + # Truncate extra shapes, strides + shapes = shapes[nd_diff:] + strides = strides[nd_diff:] + + return shapes, strides + + +def _bc_adjust_shape_strides(context, builder, shapes, strides, target_shape): + """ + Broadcast shapes and strides to target_shape given that their ndim already + matches. For each location where the shape is 1 and does not match the + dim for target, it is set to the value at the target and the stride is + set to zero. 
+ """ + bc_shapes = [] + bc_strides = [] + zero = context.get_constant(types.uintp, 0) + one = context.get_constant(types.uintp, 1) + # Adjust all mismatching ones in shape + mismatch = [ + builder.icmp_signed("!=", tar, old) + for tar, old in zip(target_shape, shapes) + ] + src_is_one = [builder.icmp_signed("==", old, one) for old in shapes] + preds = [builder.and_(x, y) for x, y in zip(mismatch, src_is_one)] + bc_shapes = [ + builder.select(p, tar, old) + for p, tar, old in zip(preds, target_shape, shapes) + ] + bc_strides = [ + builder.select(p, zero, old) for p, old in zip(preds, strides) + ] + return bc_shapes, bc_strides + + +def _broadcast_to_shape(context, builder, arrtype, arr, target_shape): + """ + Broadcast the given array to the target_shape. + Returns (array_type, array) + """ + # Compute broadcasted shape and strides + shapes = cgutils.unpack_tuple(builder, arr.shape) + strides = cgutils.unpack_tuple(builder, arr.strides) + + shapes, strides = _bc_adjust_dimension( + context, builder, shapes, strides, target_shape + ) + shapes, strides = _bc_adjust_shape_strides( + context, builder, shapes, strides, target_shape + ) + new_arrtype = arrtype.copy(ndim=len(target_shape), layout="A") + # Create new view + new_arr = make_array(new_arrtype)(context, builder) + populate_array( + new_arr, + data=arr.data, + shape=cgutils.pack_array(builder, shapes), + strides=cgutils.pack_array(builder, strides), + itemsize=arr.itemsize, + meminfo=arr.meminfo, + parent=arr.parent, + ) + return new_arrtype, new_arr + + +@intrinsic +def _numpy_broadcast_to(typingctx, array, shape): + ret = array.copy(ndim=shape.count, layout="A", readonly=True) + sig = ret(array, shape) + + def codegen(context, builder, sig, args): + src, shape_ = args + srcty = sig.args[0] + + src = make_array(srcty)(context, builder, src) + shape_ = cgutils.unpack_tuple(builder, shape_) + _, dest = _broadcast_to_shape( + context, + builder, + srcty, + src, + shape_, + ) + + # Hack to get np.broadcast_to to return a read-only array + setattr( + dest, + "parent", + Constant( + context.get_value_type(dest._datamodel.get_type("parent")), None + ), + ) + + res = dest._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + return sig, codegen + + +@intrinsic +def get_readonly_array(typingctx, arr): + # returns a copy of arr which is readonly + ret = arr.copy(readonly=True) + sig = ret(arr) + + def codegen(context, builder, sig, args): + [src] = args + srcty = sig.args[0] + + dest = make_array(srcty)(context, builder, src) + # Hack to return a read-only array + dest.parent = cgutils.get_null_value(dest.parent.type) + res = dest._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + return sig, codegen + + +@register_jitable +def _can_broadcast(array, dest_shape): + src_shape = array.shape + src_ndim = len(src_shape) + dest_ndim = len(dest_shape) + if src_ndim > dest_ndim: + raise ValueError( + "input operand has more dimensions than allowed " + "by the axis remapping" + ) + for size in dest_shape: + if size < 0: + raise ValueError( + "all elements of broadcast shape must be non-negative" + ) + + # based on _broadcast_onto function in numba/np/npyimpl.py + src_index = 0 + dest_index = dest_ndim - src_ndim + while src_index < src_ndim: + src_dim = src_shape[src_index] + dest_dim = dest_shape[dest_index] + # possible cases for (src_dim, dest_dim): + # * (1, 1) -> Ok + # * (>1, 1) -> Error! + # * (>1, >1) -> src_dim == dest_dim else error! 
+ # * (1, >1) -> Ok + if src_dim == dest_dim or src_dim == 1: + src_index += 1 + dest_index += 1 + else: + raise ValueError( + "operands could not be broadcast together with remapped shapes" + ) + + +def _default_broadcast_to_impl(array, shape): + array = np.asarray(array) + _can_broadcast(array, shape) + return _numpy_broadcast_to(array, shape) + + +@overload(np.broadcast_to) +def numpy_broadcast_to(array, shape): + if not type_can_asarray(array): + raise errors.TypingError( + 'The first argument "array" must be array-like' + ) + + if isinstance(shape, types.Integer): + + def impl(array, shape): + return np.broadcast_to(array, (shape,)) + + return impl + + elif isinstance(shape, types.UniTuple): + if not isinstance(shape.dtype, types.Integer): + msg = 'The second argument "shape" must be a tuple of integers' + raise errors.TypingError(msg) + return _default_broadcast_to_impl + + elif isinstance(shape, types.Tuple) and shape.count > 0: + # check if all types are integers + if not all([isinstance(typ, types.IntegerLiteral) for typ in shape]): + msg = f'"{shape}" object cannot be interpreted as an integer' + raise errors.TypingError(msg) + return _default_broadcast_to_impl + elif isinstance(shape, types.Tuple) and shape.count == 0: + is_scalar_array = isinstance(array, types.Array) and array.ndim == 0 + if type_is_scalar(array) or is_scalar_array: + + def impl(array, shape): # broadcast_to(array, ()) + # Array type must be supported by "type_can_asarray" + # Quick note that unicode types are not supported! + array = np.asarray(array) + return get_readonly_array(array) + + return impl + + else: + msg = "Cannot broadcast a non-scalar to a scalar array" + raise errors.TypingError(msg) + else: + msg = ( + 'The argument "shape" must be a tuple or an integer. Got %s' % shape + ) + raise errors.TypingError(msg) + + +@register_jitable +def numpy_broadcast_shapes_list(r, m, shape): + for i in range(len(shape)): + k = m - len(shape) + i + tmp = shape[i] + if tmp < 0: + raise ValueError("negative dimensions are not allowed") + if tmp == 1: + continue + if r[k] == 1: + r[k] = tmp + elif r[k] != tmp: + raise ValueError( + "shape mismatch: objects cannot be broadcast to a single shape" + ) + + +@overload(np.broadcast_shapes) +def ol_numpy_broadcast_shapes(*args): + # Based on https://github.com/numpy/numpy/blob/f702b26fff3271ba6a6ba29a021fc19051d1f007/numpy/core/src/multiarray/iterators.c#L1129-L1212 # noqa + for idx, arg in enumerate(args): + is_int = isinstance(arg, types.Integer) + is_int_tuple = isinstance(arg, types.UniTuple) and isinstance( + arg.dtype, types.Integer + ) + is_empty_tuple = isinstance(arg, types.Tuple) and len(arg.types) == 0 + if not (is_int or is_int_tuple or is_empty_tuple): + msg = ( + f"Argument {idx} must be either an int or tuple[int]. 
Got {arg}" + ) + raise errors.TypingError(msg) + + # discover the number of dimensions + m = 0 + for arg in args: + if isinstance(arg, types.Integer): + m = max(m, 1) + elif isinstance(arg, types.BaseTuple): + m = max(m, len(arg)) + + if m == 0: + return lambda *args: () + else: + tup_init = (1,) * m + + def impl(*args): + # propagate args + r = [1] * m + tup = tup_init + for arg in literal_unroll(args): + if isinstance(arg, tuple) and len(arg) > 0: + numpy_broadcast_shapes_list(r, m, arg) + elif isinstance(arg, int): + numpy_broadcast_shapes_list(r, m, (arg,)) + for idx, elem in enumerate(r): + tup = tuple_setitem(tup, idx, elem) + return tup + + return impl + + +@overload(np.broadcast_arrays) +def numpy_broadcast_arrays(*args): + for idx, arg in enumerate(args): + if not type_can_asarray(arg): + raise errors.TypingError(f'Argument "{idx}" must be array-like') + + unified_dtype = None + dt = None + for arg in args: + if isinstance(arg, (types.Array, types.BaseTuple)): + dt = arg.dtype + else: + dt = arg + + if unified_dtype is None: + unified_dtype = dt + elif unified_dtype != dt: + raise errors.TypingError( + "Mismatch of argument types. Numba cannot " + "broadcast arrays with different types. " + f"Got {args}" + ) + + # number of dimensions + m = 0 + for idx, arg in enumerate(args): + if isinstance(arg, types.ArrayCompatible): + m = max(m, arg.ndim) + elif isinstance(arg, (types.Number, types.Boolean, types.BaseTuple)): + m = max(m, 1) + else: + raise errors.TypingError(f"Unhandled type {arg}") + + tup_init = (0,) * m + + def impl(*args): + # find out the output shape + # we can't call np.broadcast_shapes here since args may have arrays + # with different shapes and it is not possible to create a list + # with those shapes dynamically + shape = [1] * m + for array in literal_unroll(args): + numpy_broadcast_shapes_list(shape, m, np.asarray(array).shape) + + tup = tup_init + + for i in range(m): + tup = tuple_setitem(tup, i, shape[i]) + + # numpy checks if the input arrays have the same shape as `shape` + outs = [] + for array in literal_unroll(args): + outs.append(np.broadcast_to(np.asarray(array), tup)) + return outs + + return impl + + +def raise_with_shape_context(src_shapes, index_shape): + """Targets should implement this if they wish to specialize the error + handling/messages. The overload implementation takes two tuples as arguments + and should raise a ValueError.""" + raise NotImplementedError + + +@overload(raise_with_shape_context, target="generic") +def ol_raise_with_shape_context_generic(src_shapes, index_shape): + # This overload is for a "generic" target, which makes no assumption about + # the NRT or string support, but does assume exceptions can be raised. 
+ if ( + isinstance(src_shapes, types.UniTuple) + and isinstance(index_shape, types.UniTuple) + and src_shapes.dtype == index_shape.dtype + and isinstance(src_shapes.dtype, types.Integer) + ): + + def impl(src_shapes, index_shape): + raise ValueError("cannot assign slice from input of different size") + + return impl + + +@overload(raise_with_shape_context, target="CPU") +def ol_raise_with_shape_context_cpu(src_shapes, index_shape): + if ( + isinstance(src_shapes, types.UniTuple) + and isinstance(index_shape, types.UniTuple) + and src_shapes.dtype == index_shape.dtype + and isinstance(src_shapes.dtype, types.Integer) + ): + + def impl(src_shapes, index_shape): + if len(src_shapes) == 1: + shape_str = f"({src_shapes[0]},)" + else: + shape_str = f"({', '.join([str(x) for x in src_shapes])})" + if len(index_shape) == 1: + index_str = f"({index_shape[0]},)" + else: + index_str = f"({', '.join([str(x) for x in index_shape])})" + msg = ( + f"cannot assign slice of shape {shape_str} from input of " + f"shape {index_str}" + ) + raise ValueError(msg) + + return impl + + +def fancy_setslice(context, builder, sig, args, index_types, indices): + """ + Implement slice assignment for arrays. This implementation works for + basic as well as fancy indexing, since there's no functional difference + between the two for indexed assignment. + """ + aryty, _, srcty = sig.args + ary, _, src = args + + ary = make_array(aryty)(context, builder, ary) + dest_shapes = cgutils.unpack_tuple(builder, ary.shape) + dest_strides = cgutils.unpack_tuple(builder, ary.strides) + dest_data = ary.data + + indexer = FancyIndexer(context, builder, aryty, ary, index_types, indices) + indexer.prepare() + + def raise_shape_mismatch_error(context, builder, src_shapes, index_shape): + # This acts as the "trampoline" to raise a ValueError in the case + # of the source and destination shapes mismatch at runtime. 
It resolves + # the public overload stub `raise_with_shape_context` + fnty = context.typing_context.resolve_value_type( + raise_with_shape_context + ) + argtys = ( + types.UniTuple(types.int64, len(src_shapes)), + types.UniTuple(types.int64, len(index_shape)), + ) + raise_sig = fnty.get_call_type(context.typing_context, argtys, {}) + func = context.get_function(fnty, raise_sig) + func( + builder, + ( + context.make_tuple(builder, raise_sig.args[0], src_shapes), + context.make_tuple(builder, raise_sig.args[1], index_shape), + ), + ) + + if isinstance(srcty, types.Buffer): + # Source is an array + src_dtype = srcty.dtype + index_shape = indexer.get_shape() + src = make_array(srcty)(context, builder, src) + # Broadcast source array to shape + srcty, src = _broadcast_to_shape( + context, builder, srcty, src, index_shape + ) + src_shapes = cgutils.unpack_tuple(builder, src.shape) + src_strides = cgutils.unpack_tuple(builder, src.strides) + src_data = src.data + + # Check shapes are equal + shape_error = cgutils.false_bit + assert len(index_shape) == len(src_shapes) + + for u, v in zip(src_shapes, index_shape): + shape_error = builder.or_( + shape_error, builder.icmp_signed("!=", u, v) + ) + + with builder.if_then(shape_error, likely=False): + raise_shape_mismatch_error( + context, builder, src_shapes, index_shape + ) + + # Check for array overlap + src_start, src_end = get_array_memory_extents( + context, builder, srcty, src, src_shapes, src_strides, src_data + ) + + dest_lower, dest_upper = indexer.get_offset_bounds( + dest_strides, ary.itemsize + ) + dest_start, dest_end = compute_memory_extents( + context, builder, dest_lower, dest_upper, dest_data + ) + + use_copy = extents_may_overlap( + context, builder, src_start, src_end, dest_start, dest_end + ) + + src_getitem, src_cleanup = maybe_copy_source( + context, + builder, + use_copy, + srcty, + src, + src_shapes, + src_strides, + src_data, + ) + + elif isinstance(srcty, types.Sequence): + src_dtype = srcty.dtype + + # Check shape is equal to sequence length + index_shape = indexer.get_shape() + assert len(index_shape) == 1 + len_impl = context.get_function(len, signature(types.intp, srcty)) + seq_len = len_impl(builder, (src,)) + + shape_error = builder.icmp_signed("!=", index_shape[0], seq_len) + + with builder.if_then(shape_error, likely=False): + raise_shape_mismatch_error( + context, builder, (seq_len,), (index_shape[0],) + ) + + def src_getitem(source_indices): + (idx,) = source_indices + getitem_impl = context.get_function( + operator.getitem, + signature(src_dtype, srcty, types.intp), + ) + return getitem_impl(builder, (src, idx)) + + def src_cleanup(): + pass + + else: + # Source is a scalar (broadcast or not, depending on destination + # shape). + src_dtype = srcty + + def src_getitem(source_indices): + return src + + def src_cleanup(): + pass + + zero = context.get_constant(types.uintp, 0) + # Loop on destination and copy from source to destination + dest_indices, counts = indexer.begin_loops() + + # Source is iterated in natural order + + # Counts represent a counter for the number of times a specified axis + # is being accessed, during setitem they are used as source + # indices + counts = list(counts) + + # We need to artifically introduce the index zero wherever a + # newaxis is present within the indexer. These always remain + # zero. 
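+    # e.g. for ary[1:, None] = src, the broadcast source is always read at
+    # index 0 along the inserted axis.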
+ for i in indexer.newaxes: + counts.insert(i, zero) + + source_indices = [c for c in counts if c is not None] + + val = src_getitem(source_indices) + + # Cast to the destination dtype (cross-dtype slice assignment is allowed) + val = context.cast(builder, val, src_dtype, aryty.dtype) + + # No need to check for wraparound, as the indexers all ensure + # a positive index is returned. + dest_ptr = cgutils.get_item_pointer2( + context, + builder, + dest_data, + dest_shapes, + dest_strides, + aryty.layout, + dest_indices, + wraparound=False, + boundscheck=context.enable_boundscheck, + ) + store_item(context, builder, aryty, val, dest_ptr) + + indexer.end_loops() + + src_cleanup() + + return context.get_dummy_value() + + +# ------------------------------------------------------------------------------ +# Shape / layout altering + + +def vararg_to_tuple(context, builder, sig, args): + aryty = sig.args[0] + dimtys = sig.args[1:] + # values + ary = args[0] + dims = args[1:] + # coerce all types to intp + dims = [ + context.cast(builder, val, ty, types.intp) + for ty, val in zip(dimtys, dims) + ] + # make a tuple + shape = cgutils.pack_array(builder, dims, dims[0].type) + + shapety = types.UniTuple(dtype=types.intp, count=len(dims)) + new_sig = typing.signature(sig.return_type, aryty, shapety) + new_args = ary, shape + + return new_sig, new_args + + +@lower_builtin("array.transpose", types.Array) +def array_transpose(context, builder, sig, args): + return array_T(context, builder, sig.args[0], args[0]) + + +def permute_arrays(axis, shape, strides): + if len(axis) != len(set(axis)): + raise ValueError("repeated axis in transpose") + dim = len(shape) + for x in axis: + if x >= dim or abs(x) > dim: + raise ValueError( + "axis is out of bounds for array of given dimension" + ) + + shape[:] = shape[axis] + strides[:] = strides[axis] + + +# Transposing an array involves permuting the shape and strides of the array +# based on the given axes. +@lower_builtin("array.transpose", types.Array, types.BaseTuple) +def array_transpose_tuple(context, builder, sig, args): + aryty = sig.args[0] + ary = make_array(aryty)(context, builder, args[0]) + + axisty, axis = sig.args[1], args[1] + num_axis, dtype = axisty.count, axisty.dtype + + ll_intp = context.get_value_type(types.intp) + ll_ary_size = ir.ArrayType(ll_intp, num_axis) + + # Allocate memory for axes, shapes, and strides arrays. + arys = [axis, ary.shape, ary.strides] + ll_arys = [cgutils.alloca_once(builder, ll_ary_size) for _ in arys] + + # Store axes, shapes, and strides arrays to the allocated memory. + for src, dst in zip(arys, ll_arys): + builder.store(src, dst) + + np_ary_ty = types.Array(dtype=dtype, ndim=1, layout="C") + np_itemsize = context.get_constant( + types.intp, context.get_abi_sizeof(ll_intp) + ) + + # Form NumPy arrays for axes, shapes, and strides arrays. + np_arys = [make_array(np_ary_ty)(context, builder) for _ in arys] + + # Roughly, `np_ary = np.array(ll_ary)` for each of axes, shapes, and strides + for np_ary, ll_ary in zip(np_arys, ll_arys): + populate_array( + np_ary, + data=builder.bitcast(ll_ary, ll_intp.as_pointer()), + shape=[context.get_constant(types.intp, num_axis)], + strides=[np_itemsize], + itemsize=np_itemsize, + meminfo=None, + ) + + # Pass NumPy arrays formed above to permute_arrays function that permutes + # shapes and strides based on axis contents. 
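+    # (Roughly: for axes=(1, 0) on a 2-D array, permute_arrays swaps both the
+    # shape entries and the stride entries in place; the data pointer itself is
+    # reused unchanged when the transposed view is populated below.)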
+ context.compile_internal( + builder, + permute_arrays, + typing.signature(types.void, np_ary_ty, np_ary_ty, np_ary_ty), + [a._getvalue() for a in np_arys], + ) + + # Make a new array based on permuted shape and strides and return it. + ret = make_array(sig.return_type)(context, builder) + populate_array( + ret, + data=ary.data, + shape=builder.load(ll_arys[1]), + strides=builder.load(ll_arys[2]), + itemsize=ary.itemsize, + meminfo=ary.meminfo, + parent=ary.parent, + ) + res = ret._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin("array.transpose", types.Array, types.VarArg(types.Any)) +def array_transpose_vararg(context, builder, sig, args): + new_sig, new_args = vararg_to_tuple(context, builder, sig, args) + return array_transpose_tuple(context, builder, new_sig, new_args) + + +@overload(np.transpose) +def numpy_transpose(a, axes=None): + if isinstance(a, types.BaseTuple): + raise errors.TypingError("np.transpose does not accept tuples") + + if axes is None: + + def np_transpose_impl(a, axes=None): + return a.transpose() + else: + + def np_transpose_impl(a, axes=None): + return a.transpose(axes) + + return np_transpose_impl + + +@lower_getattr(types.Array, "T") +def array_T(context, builder, typ, value): + if typ.ndim <= 1: + res = value + else: + ary = make_array(typ)(context, builder, value) + ret = make_array(typ)(context, builder) + shapes = cgutils.unpack_tuple(builder, ary.shape, typ.ndim) + strides = cgutils.unpack_tuple(builder, ary.strides, typ.ndim) + populate_array( + ret, + data=ary.data, + shape=cgutils.pack_array(builder, shapes[::-1]), + strides=cgutils.pack_array(builder, strides[::-1]), + itemsize=ary.itemsize, + meminfo=ary.meminfo, + parent=ary.parent, + ) + res = ret._getvalue() + return impl_ret_borrowed(context, builder, typ, res) + + +@overload(np.logspace) +def numpy_logspace(start, stop, num=50): + if not isinstance(start, types.Number): + raise errors.TypingError('The first argument "start" must be a number') + if not isinstance(stop, types.Number): + raise errors.TypingError('The second argument "stop" must be a number') + if not isinstance(num, (int, types.Integer)): + raise errors.TypingError('The third argument "num" must be an integer') + + def impl(start, stop, num=50): + y = np.linspace(start, stop, num) + return np.power(10.0, y) + + return impl + + +@overload(np.geomspace) +def numpy_geomspace(start, stop, num=50): + if not isinstance(start, types.Number): + msg = 'The argument "start" must be a number' + raise errors.TypingError(msg) + + if not isinstance(stop, types.Number): + msg = 'The argument "stop" must be a number' + raise errors.TypingError(msg) + + if not isinstance(num, (int, types.Integer)): + msg = 'The argument "num" must be an integer' + raise errors.TypingError(msg) + + if any(isinstance(arg, types.Complex) for arg in [start, stop]): + result_dtype = from_dtype( + np.result_type(as_dtype(start), as_dtype(stop), None) + ) + + def impl(start, stop, num=50): + if start == 0 or stop == 0: + raise ValueError("Geometric sequence cannot include zero") + start = result_dtype(start) + stop = result_dtype(stop) + if numpy_version < (2, 0): + both_imaginary = (start.real == 0) & (stop.real == 0) + both_negative = (np.sign(start) == -1) & (np.sign(stop) == -1) + out_sign = 1 + if both_imaginary: + start = start.imag + stop = stop.imag + out_sign = 1j + if both_negative: + start = -start + stop = -stop + out_sign = -out_sign + else: + out_sign = np.sign(start) + start /= out_sign + stop /= out_sign + 
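+            # Whatever sign/phase factor was split off above now lives in
+            # out_sign and is reapplied to the result before returning; the
+            # log10-based interpolation below runs on the adjusted endpoints.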
+ logstart = np.log10(start) + logstop = np.log10(stop) + result = np.logspace(logstart, logstop, num) + # Make sure the endpoints match the start and stop arguments. + # This is necessary because np.exp(np.log(x)) is not necessarily + # equal to x. + if num > 0: + result[0] = start + if num > 1: + result[-1] = stop + return out_sign * result + + else: + + def impl(start, stop, num=50): + if start == 0 or stop == 0: + raise ValueError("Geometric sequence cannot include zero") + both_negative = (np.sign(start) == -1) & (np.sign(stop) == -1) + out_sign = 1 + if both_negative: + start = -start + stop = -stop + out_sign = -out_sign + logstart = np.log10(start) + logstop = np.log10(stop) + result = np.logspace(logstart, logstop, num) + # Make sure the endpoints match the start and stop arguments. + # This is necessary because np.exp(np.log(x)) is not necessarily + # equal to x. + if num > 0: + result[0] = start + if num > 1: + result[-1] = stop + return out_sign * result + + return impl + + +@overload(np.rot90) +def numpy_rot90(m, k=1): + # supporting axes argument it needs to be included in np.flip + if not isinstance(k, (int, types.Integer)): + raise errors.TypingError('The second argument "k" must be an integer') + if not isinstance(m, types.Array): + raise errors.TypingError('The first argument "m" must be an array') + + if m.ndim < 2: + raise errors.NumbaValueError("Input must be >= 2-d.") + + def impl(m, k=1): + k = k % 4 + if k == 0: + return m[:] + elif k == 1: + return np.swapaxes(np.fliplr(m), 0, 1) + elif k == 2: + return np.flipud(np.fliplr(m)) + elif k == 3: + return np.fliplr(np.swapaxes(m, 0, 1)) + else: + raise AssertionError # unreachable + + return impl + + +def _attempt_nocopy_reshape( + context, builder, aryty, ary, newnd, newshape, newstrides +): + """ + Call into Numba_attempt_nocopy_reshape() for the given array type + and instance, and the specified new shape. + + Return value is non-zero if successful, and the array pointed to + by *newstrides* will be filled up with the computed results. 
+ """ + ll_intp = context.get_value_type(types.intp) + ll_intp_star = ll_intp.as_pointer() + ll_intc = context.get_value_type(types.intc) + fnty = ir.FunctionType( + ll_intc, + [ + # nd, *dims, *strides + ll_intp, + ll_intp_star, + ll_intp_star, + # newnd, *newdims, *newstrides + ll_intp, + ll_intp_star, + ll_intp_star, + # itemsize, is_f_order + ll_intp, + ll_intc, + ], + ) + fn = cgutils.get_or_insert_function( + builder.module, fnty, "numba_attempt_nocopy_reshape" + ) + + nd = ll_intp(aryty.ndim) + shape = cgutils.gep_inbounds(builder, ary._get_ptr_by_name("shape"), 0, 0) + strides = cgutils.gep_inbounds( + builder, ary._get_ptr_by_name("strides"), 0, 0 + ) + newnd = ll_intp(newnd) + newshape = cgutils.gep_inbounds(builder, newshape, 0, 0) + newstrides = cgutils.gep_inbounds(builder, newstrides, 0, 0) + is_f_order = ll_intc(0) + res = builder.call( + fn, + [ + nd, + shape, + strides, + newnd, + newshape, + newstrides, + ary.itemsize, + is_f_order, + ], + ) + return res + + +def normalize_reshape_value(origsize, shape): + num_neg_value = 0 + known_size = 1 + for ax, s in enumerate(shape): + if s < 0: + num_neg_value += 1 + neg_ax = ax + else: + known_size *= s + + if num_neg_value == 0: + if origsize != known_size: + raise ValueError("total size of new array must be unchanged") + + elif num_neg_value == 1: + # Infer negative dimension + if known_size == 0: + inferred = 0 + ok = origsize == 0 + else: + inferred = origsize // known_size + ok = origsize % known_size == 0 + if not ok: + raise ValueError("total size of new array must be unchanged") + shape[neg_ax] = inferred + + else: + raise ValueError("multiple negative shape values") + + +@lower_builtin("array.reshape", types.Array, types.BaseTuple) +def array_reshape(context, builder, sig, args): + aryty = sig.args[0] + retty = sig.return_type + + shapety = sig.args[1] + shape = args[1] + + ll_intp = context.get_value_type(types.intp) + ll_shape = ir.ArrayType(ll_intp, shapety.count) + + ary = make_array(aryty)(context, builder, args[0]) + + # We will change the target shape in this slot + # (see normalize_reshape_value() below) + newshape = cgutils.alloca_once(builder, ll_shape) + builder.store(shape, newshape) + + # Create a shape array pointing to the value of newshape. 
+ # (roughly, `shape_ary = np.array(ary.shape)`) + shape_ary_ty = types.Array(dtype=shapety.dtype, ndim=1, layout="C") + shape_ary = make_array(shape_ary_ty)(context, builder) + shape_itemsize = context.get_constant( + types.intp, context.get_abi_sizeof(ll_intp) + ) + populate_array( + shape_ary, + data=builder.bitcast(newshape, ll_intp.as_pointer()), + shape=[context.get_constant(types.intp, shapety.count)], + strides=[shape_itemsize], + itemsize=shape_itemsize, + meminfo=None, + ) + + # Compute the original array size + size = ary.nitems + + # Call our normalizer which will fix the shape array in case of negative + # shape value + context.compile_internal( + builder, + normalize_reshape_value, + typing.signature(types.void, types.uintp, shape_ary_ty), + [size, shape_ary._getvalue()], + ) + + # Perform reshape (nocopy) + newnd = shapety.count + newstrides = cgutils.alloca_once(builder, ll_shape) + + ok = _attempt_nocopy_reshape( + context, builder, aryty, ary, newnd, newshape, newstrides + ) + fail = builder.icmp_unsigned("==", ok, ok.type(0)) + + with builder.if_then(fail): + msg = "incompatible shape for array" + context.call_conv.return_user_exc(builder, NotImplementedError, (msg,)) + + ret = make_array(retty)(context, builder) + populate_array( + ret, + data=ary.data, + shape=builder.load(newshape), + strides=builder.load(newstrides), + itemsize=ary.itemsize, + meminfo=ary.meminfo, + parent=ary.parent, + ) + res = ret._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin("array.reshape", types.Array, types.VarArg(types.Any)) +def array_reshape_vararg(context, builder, sig, args): + new_sig, new_args = vararg_to_tuple(context, builder, sig, args) + return array_reshape(context, builder, new_sig, new_args) + + +if numpy_version < (2, 1): + + @overload(np.reshape) + def np_reshape(a, newshape): + def np_reshape_impl(a, newshape): + return a.reshape(newshape) + + return np_reshape_impl +else: + + @overload(np.reshape) + def np_reshape(a, shape): + def np_reshape_impl(a, shape): + return a.reshape(shape) + + return np_reshape_impl + + +@overload(np.resize) +def numpy_resize(a, new_shape): + if not type_can_asarray(a): + msg = 'The argument "a" must be array-like' + raise errors.TypingError(msg) + + if not ( + ( + isinstance(new_shape, types.UniTuple) + and isinstance(new_shape.dtype, types.Integer) + ) + or isinstance(new_shape, types.Integer) + ): + msg = ( + 'The argument "new_shape" must be an integer or a tuple of integers' + ) + raise errors.TypingError(msg) + + def impl(a, new_shape): + a = np.asarray(a) + a = np.ravel(a) + + if isinstance(new_shape, tuple): + new_size = 1 + for dim_length in np.asarray(new_shape): + new_size *= dim_length + if dim_length < 0: + msg = "All elements of `new_shape` must be non-negative" + raise ValueError(msg) + else: + if new_shape < 0: + msg2 = "All elements of `new_shape` must be non-negative" + raise ValueError(msg2) + new_size = new_shape + + if a.size == 0: + return np.zeros(new_shape).astype(a.dtype) + + repeats = -(-new_size // a.size) # ceil division + res = a + for i in range(repeats - 1): + res = np.concatenate((res, a)) + res = res[:new_size] + + return np.reshape(res, new_shape) + + return impl + + +@overload(np.append) +def np_append(arr, values, axis=None): + if not type_can_asarray(arr): + raise errors.TypingError('The first argument "arr" must be array-like') + + if not type_can_asarray(values): + raise errors.TypingError( + 'The second argument "values" must be array-like' + ) + + if 
is_nonelike(axis): + + def impl(arr, values, axis=None): + arr = np.ravel(np.asarray(arr)) + values = np.ravel(np.asarray(values)) + return np.concatenate((arr, values)) + else: + if not isinstance(axis, types.Integer): + raise errors.TypingError( + 'The third argument "axis" must be an integer' + ) + + def impl(arr, values, axis=None): + return np.concatenate((arr, values), axis=axis) + + return impl + + +@lower_builtin("array.ravel", types.Array) +def array_ravel(context, builder, sig, args): + # Only support no argument version (default order='C') + def imp_nocopy(ary): + """No copy version""" + return ary.reshape(ary.size) + + def imp_copy(ary): + """Copy version""" + return ary.flatten() + + # If the input array is C layout already, use the nocopy version + if sig.args[0].layout == "C": + imp = imp_nocopy + # otherwise, use flatten under-the-hood + else: + imp = imp_copy + + res = context.compile_internal(builder, imp, sig, args) + res = impl_ret_new_ref(context, builder, sig.return_type, res) + return res + + +@lower_builtin(np.ravel, types.Array) +def np_ravel(context, builder, sig, args): + def np_ravel_impl(a): + return a.ravel() + + return context.compile_internal(builder, np_ravel_impl, sig, args) + + +@lower_builtin("array.flatten", types.Array) +def array_flatten(context, builder, sig, args): + # Only support flattening to C layout currently. + def imp(ary): + return ary.copy().reshape(ary.size) + + res = context.compile_internal(builder, imp, sig, args) + res = impl_ret_new_ref(context, builder, sig.return_type, res) + return res + + +@register_jitable +def _np_clip_impl(a, a_min, a_max, out): + # Both a_min and a_max are numpy arrays + ret = np.empty_like(a) if out is None else out + a_b, a_min_b, a_max_b = np.broadcast_arrays(a, a_min, a_max) + for index in np.ndindex(a_b.shape): + val_a = a_b[index] + val_a_min = a_min_b[index] + val_a_max = a_max_b[index] + ret[index] = min(max(val_a, val_a_min), val_a_max) + + return ret + + +@register_jitable +def _np_clip_impl_none(a, b, use_min, out): + for index in np.ndindex(a.shape): + val_a = a[index] + val_b = b[index] + if use_min: + out[index] = min(val_a, val_b) + else: + out[index] = max(val_a, val_b) + return out + + +@overload(np.clip) +def np_clip(a, a_min, a_max, out=None): + if not type_can_asarray(a): + raise errors.TypingError('The argument "a" must be array-like') + + if not isinstance(a_min, types.NoneType) and not type_can_asarray(a_min): + raise errors.TypingError( + ('The argument "a_min" must be a number or an array-like') + ) + + if not isinstance(a_max, types.NoneType) and not type_can_asarray(a_max): + raise errors.TypingError( + 'The argument "a_max" must be a number or an array-like' + ) + + if not (isinstance(out, types.Array) or is_nonelike(out)): + msg = 'The argument "out" must be an array if it is provided' + raise errors.TypingError(msg) + + # TODO: support scalar a (issue #3469) + a_min_is_none = a_min is None or isinstance(a_min, types.NoneType) + a_max_is_none = a_max is None or isinstance(a_max, types.NoneType) + + if a_min_is_none and a_max_is_none: + # Raises value error when both a_min and a_max are None + def np_clip_nn(a, a_min, a_max, out=None): + raise ValueError("array_clip: must set either max or min") + + return np_clip_nn + + a_min_is_scalar = isinstance(a_min, types.Number) + a_max_is_scalar = isinstance(a_max, types.Number) + + if a_min_is_scalar and a_max_is_scalar: + + def np_clip_ss(a, a_min, a_max, out=None): + # a_min and a_max are scalars + # since their shape will be empty + # 
so broadcasting is not needed at all + ret = np.empty_like(a) if out is None else out + for index in np.ndindex(a.shape): + val_a = a[index] + ret[index] = min(max(val_a, a_min), a_max) + + return ret + + return np_clip_ss + elif a_min_is_scalar and not a_max_is_scalar: + if a_max_is_none: + + def np_clip_sn(a, a_min, a_max, out=None): + # a_min is a scalar + # since its shape will be empty + # so broadcasting is not needed at all + ret = np.empty_like(a) if out is None else out + for index in np.ndindex(a.shape): + val_a = a[index] + ret[index] = max(val_a, a_min) + + return ret + + return np_clip_sn + else: + + def np_clip_sa(a, a_min, a_max, out=None): + # a_min is a scalar + # since its shape will be empty + # broadcast it to shape of a + # by using np.full_like + a_min_full = np.full_like(a, a_min) + return _np_clip_impl(a, a_min_full, a_max, out) + + return np_clip_sa + elif not a_min_is_scalar and a_max_is_scalar: + if a_min_is_none: + + def np_clip_ns(a, a_min, a_max, out=None): + # a_max is a scalar + # since its shape will be empty + # so broadcasting is not needed at all + ret = np.empty_like(a) if out is None else out + for index in np.ndindex(a.shape): + val_a = a[index] + ret[index] = min(val_a, a_max) + + return ret + + return np_clip_ns + else: + + def np_clip_as(a, a_min, a_max, out=None): + # a_max is a scalar + # since its shape will be empty + # broadcast it to shape of a + # by using np.full_like + a_max_full = np.full_like(a, a_max) + return _np_clip_impl(a, a_min, a_max_full, out) + + return np_clip_as + else: + # Case where exactly one of a_min or a_max is None + if a_min_is_none: + + def np_clip_na(a, a_min, a_max, out=None): + # a_max is a numpy array but a_min is None + ret = np.empty_like(a) if out is None else out + a_b, a_max_b = np.broadcast_arrays(a, a_max) + return _np_clip_impl_none(a_b, a_max_b, True, ret) + + return np_clip_na + elif a_max_is_none: + + def np_clip_an(a, a_min, a_max, out=None): + # a_min is a numpy array but a_max is None + ret = np.empty_like(a) if out is None else out + a_b, a_min_b = np.broadcast_arrays(a, a_min) + return _np_clip_impl_none(a_b, a_min_b, False, ret) + + return np_clip_an + else: + + def np_clip_aa(a, a_min, a_max, out=None): + # Both a_min and a_max are clearly arrays + # because none of the above branches + # returned + return _np_clip_impl(a, a_min, a_max, out) + + return np_clip_aa + + +@overload_method(types.Array, "clip") +def array_clip(a, a_min=None, a_max=None, out=None): + def impl(a, a_min=None, a_max=None, out=None): + return np.clip(a, a_min, a_max, out) + + return impl + + +def _change_dtype(context, builder, oldty, newty, ary): + """ + Attempt to fix up *ary* for switching from *oldty* to *newty*. + + See Numpy's array_descr_set() + (np/core/src/multiarray/getset.c). + Attempt to fix the array's shape and strides for a new dtype. + False is returned on failure, True on success. + """ + assert oldty.ndim == newty.ndim + assert oldty.layout == newty.layout + + new_layout = ord(newty.layout) + any_layout = ord("A") + c_layout = ord("C") + f_layout = ord("F") + + int8 = types.int8 + + def imp(nd, dims, strides, old_itemsize, new_itemsize, layout): + # Attempt to update the layout due to limitation of the numba + # type system. 
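+        # (Roughly: a C-contiguous int32 array viewed as int64 halves its last
+        # dimension, while viewing it as int16 doubles it; dims[i] and
+        # strides[i] are rewritten below accordingly, and False is returned
+        # when the byte counts do not divide evenly.)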
+ if layout == any_layout: + # Test rightmost stride to be contiguous + if strides[-1] == old_itemsize: + # Process this as if it is C contiguous + layout = int8(c_layout) + # Test leftmost stride to be F contiguous + elif strides[0] == old_itemsize: + # Process this as if it is F contiguous + layout = int8(f_layout) + + if old_itemsize != new_itemsize and (layout == any_layout or nd == 0): + return False + + if layout == c_layout: + i = nd - 1 + else: + i = 0 + + if new_itemsize < old_itemsize: + # If it is compatible, increase the size of the dimension + # at the end (or at the front if F-contiguous) + if (old_itemsize % new_itemsize) != 0: + return False + + newdim = old_itemsize // new_itemsize + dims[i] *= newdim + strides[i] = new_itemsize + + elif new_itemsize > old_itemsize: + # Determine if last (or first if F-contiguous) dimension + # is compatible + bytelength = dims[i] * old_itemsize + if (bytelength % new_itemsize) != 0: + return False + + dims[i] = bytelength // new_itemsize + strides[i] = new_itemsize + + else: + # Same item size: nothing to do (this also works for + # non-contiguous arrays). + pass + + return True + + old_itemsize = context.get_constant( + types.intp, get_itemsize(context, oldty) + ) + new_itemsize = context.get_constant( + types.intp, get_itemsize(context, newty) + ) + + nd = context.get_constant(types.intp, newty.ndim) + shape_data = cgutils.gep_inbounds( + builder, ary._get_ptr_by_name("shape"), 0, 0 + ) + strides_data = cgutils.gep_inbounds( + builder, ary._get_ptr_by_name("strides"), 0, 0 + ) + + shape_strides_array_type = types.Array(dtype=types.intp, ndim=1, layout="C") + arycls = context.make_array(shape_strides_array_type) + + shape_constant = cgutils.pack_array( + builder, [context.get_constant(types.intp, newty.ndim)] + ) + + sizeof_intp = context.get_abi_sizeof(context.get_data_type(types.intp)) + sizeof_intp = context.get_constant(types.intp, sizeof_intp) + strides_constant = cgutils.pack_array(builder, [sizeof_intp]) + + shape_ary = arycls(context, builder) + + populate_array( + shape_ary, + data=shape_data, + shape=shape_constant, + strides=strides_constant, + itemsize=sizeof_intp, + meminfo=None, + ) + + strides_ary = arycls(context, builder) + populate_array( + strides_ary, + data=strides_data, + shape=shape_constant, + strides=strides_constant, + itemsize=sizeof_intp, + meminfo=None, + ) + + shape = shape_ary._getvalue() + strides = strides_ary._getvalue() + args = [ + nd, + shape, + strides, + old_itemsize, + new_itemsize, + context.get_constant(types.int8, new_layout), + ] + + sig = signature( + types.boolean, + types.intp, # nd + shape_strides_array_type, # dims + shape_strides_array_type, # strides + types.intp, # old_itemsize + types.intp, # new_itemsize + types.int8, # layout + ) + + res = context.compile_internal(builder, imp, sig, args) + update_array_info(newty, ary) + res = impl_ret_borrowed(context, builder, sig.return_type, res) + return res + + +@overload(np.shape) +def np_shape(a): + if not type_can_asarray(a): + raise errors.TypingError("The argument to np.shape must be array-like") + + def impl(a): + return np.asarray(a).shape + + return impl + + +@overload(np.size) +def np_size(a): + if not type_can_asarray(a): + raise errors.TypingError("The argument to np.size must be array-like") + + def impl(a): + return np.asarray(a).size + + return impl + + +# ------------------------------------------------------------------------------ + + +@overload(np.unique) +def np_unique(ar): + def np_unique_impl(ar): + b = np.sort(ar.ravel()) 
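+        # b is sorted, so duplicates are adjacent: keep the first element and
+        # every subsequent element that differs from its predecessor.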
+ head = list(b[:1]) + tail = [x for i, x in enumerate(b[1:]) if b[i] != x] + return np.array(head + tail) + + return np_unique_impl + + +@overload(np.repeat) +def np_repeat(a, repeats): + # Implementation for repeats being a scalar is a module global function + # (see below) because it might be called from the implementation below. + + def np_repeat_impl_repeats_array_like(a, repeats): + # implementation if repeats is an array like + repeats_array = np.asarray(repeats, dtype=np.int64) + # if it is a singleton array, invoke the scalar implementation + if repeats_array.shape[0] == 1: + return np_repeat_impl_repeats_scaler(a, repeats_array[0]) + if np.any(repeats_array < 0): + raise ValueError("negative dimensions are not allowed") + asa = np.asarray(a) + aravel = asa.ravel() + n = aravel.shape[0] + if aravel.shape != repeats_array.shape: + raise ValueError("operands could not be broadcast together") + to_return = np.empty(np.sum(repeats_array), dtype=asa.dtype) + pos = 0 + for i in range(n): + to_return[pos : pos + repeats_array[i]] = aravel[i] + pos += repeats_array[i] + return to_return + + # type checking + if isinstance( + a, + ( + types.Array, + types.List, + types.BaseTuple, + types.Number, + types.Boolean, + ), + ): + if isinstance(repeats, types.Integer): + return np_repeat_impl_repeats_scaler + elif isinstance(repeats, (types.Array, types.List)): + if isinstance(repeats.dtype, types.Integer): + return np_repeat_impl_repeats_array_like + + raise errors.TypingError( + "The repeats argument must be an integer " + "or an array-like of integer dtype" + ) + + +@register_jitable +def np_repeat_impl_repeats_scaler(a, repeats): + if repeats < 0: + raise ValueError("negative dimensions are not allowed") + asa = np.asarray(a) + aravel = asa.ravel() + n = aravel.shape[0] + if repeats == 0: + return np.empty(0, dtype=asa.dtype) + elif repeats == 1: + return np.copy(aravel) + else: + to_return = np.empty(n * repeats, dtype=asa.dtype) + for i in range(n): + to_return[i * repeats : (i + 1) * repeats] = aravel[i] + return to_return + + +@extending.overload_method(types.Array, "repeat") +def array_repeat(a, repeats): + def array_repeat_impl(a, repeats): + return np.repeat(a, repeats) + + return array_repeat_impl + + +@intrinsic +def _intrin_get_itemsize(tyctx, dtype): + """Computes the itemsize of the dtype""" + sig = types.intp(dtype) + + def codegen(cgctx, builder, sig, llargs): + llty = cgctx.get_data_type(sig.args[0].dtype) + llintp = cgctx.get_data_type(sig.return_type) + return llintp(cgctx.get_abi_sizeof(llty)) + + return sig, codegen + + +def _compatible_view(a, dtype): + pass + + +@overload(_compatible_view, target="generic") +def ol_compatible_view(a, dtype): + """Determines if the array and dtype are compatible for forming a view.""" + + # NOTE: NumPy 1.23+ uses this check. 
+ # Code based on: + # https://github.com/numpy/numpy/blob/750ad21258cfc00663586d5a466e24f91b48edc7/numpy/core/src/multiarray/getset.c#L500-L555 # noqa: E501 + def impl(a, dtype): + dtype_size = _intrin_get_itemsize(dtype) + if dtype_size != a.itemsize: + # catch forbidden cases + if a.ndim == 0: + msg1 = ( + "Changing the dtype of a 0d array is only supported " + "if the itemsize is unchanged" + ) + raise ValueError(msg1) + else: + # NumPy has a check here for subarray type conversion which + # Numba doesn't support + pass + + # Resize on last axis only + axis = a.ndim - 1 + p1 = a.shape[axis] != 1 + p2 = a.size != 0 + p3 = a.strides[axis] != a.itemsize + if p1 and p2 and p3: + msg2 = ( + "To change to a dtype of a different size, the last " + "axis must be contiguous" + ) + raise ValueError(msg2) + + if dtype_size < a.itemsize: + if dtype_size == 0 or a.itemsize % dtype_size != 0: + msg3 = ( + "When changing to a smaller dtype, its size must " + "be a divisor of the size of original dtype" + ) + raise ValueError(msg3) + else: + newdim = a.shape[axis] * a.itemsize + if newdim % dtype_size != 0: + msg4 = ( + "When changing to a larger dtype, its size must be " + "a divisor of the total size in bytes of the last " + "axis of the array." + ) + raise ValueError(msg4) + + return impl + + +@lower_builtin("array.view", types.Array, types.DTypeSpec) +def array_view(context, builder, sig, args): + aryty = sig.args[0] + retty = sig.return_type + + ary = make_array(aryty)(context, builder, args[0]) + ret = make_array(retty)(context, builder) + # Copy all fields, casting the "data" pointer appropriately + fields = set(ret._datamodel._fields) + for k in sorted(fields): + val = getattr(ary, k) + if k == "data": + ptrty = ret.data.type + ret.data = builder.bitcast(val, ptrty) + else: + setattr(ret, k, val) + + tyctx = context.typing_context + fnty = tyctx.resolve_value_type(_compatible_view) + _compatible_view_sig = fnty.get_call_type(tyctx, (*sig.args,), {}) + impl = context.get_function(fnty, _compatible_view_sig) + impl(builder, args) + + ok = _change_dtype(context, builder, aryty, retty, ret) + fail = builder.icmp_unsigned("==", ok, Constant(ok.type, 0)) + + with builder.if_then(fail): + msg = "new type not compatible with array" + context.call_conv.return_user_exc(builder, ValueError, (msg,)) + + res = ret._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +# ------------------------------------------------------------------------------ +# Array attributes + + +@lower_getattr(types.Array, "dtype") +def array_dtype(context, builder, typ, value): + res = context.get_dummy_value() + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.Array, "shape") +@lower_getattr(types.MemoryView, "shape") +def array_shape(context, builder, typ, value): + arrayty = make_array(typ) + array = arrayty(context, builder, value) + res = array.shape + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.Array, "strides") +@lower_getattr(types.MemoryView, "strides") +def array_strides(context, builder, typ, value): + arrayty = make_array(typ) + array = arrayty(context, builder, value) + res = array.strides + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.Array, "ndim") +@lower_getattr(types.MemoryView, "ndim") +def array_ndim(context, builder, typ, value): + res = context.get_constant(types.intp, typ.ndim) + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.Array, "size") +def 
array_size(context, builder, typ, value): + arrayty = make_array(typ) + array = arrayty(context, builder, value) + res = array.nitems + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.Array, "itemsize") +@lower_getattr(types.MemoryView, "itemsize") +def array_itemsize(context, builder, typ, value): + arrayty = make_array(typ) + array = arrayty(context, builder, value) + res = array.itemsize + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.Array, "nbytes") +@lower_getattr(types.MemoryView, "nbytes") +def array_nbytes(context, builder, typ, value): + """ + nbytes = size * itemsize + """ + arrayty = make_array(typ) + array = arrayty(context, builder, value) + res = builder.mul(array.nitems, array.itemsize) + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.MemoryView, "contiguous") +def array_contiguous(context, builder, typ, value): + res = context.get_constant(types.boolean, typ.is_contig) + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.MemoryView, "c_contiguous") +def array_c_contiguous(context, builder, typ, value): + res = context.get_constant(types.boolean, typ.is_c_contig) + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.MemoryView, "f_contiguous") +def array_f_contiguous(context, builder, typ, value): + res = context.get_constant(types.boolean, typ.is_f_contig) + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.MemoryView, "readonly") +def array_readonly(context, builder, typ, value): + res = context.get_constant(types.boolean, not typ.mutable) + return impl_ret_untracked(context, builder, typ, res) + + +# array.ctypes + + +@lower_getattr(types.Array, "ctypes") +def array_ctypes(context, builder, typ, value): + arrayty = make_array(typ) + array = arrayty(context, builder, value) + # Create new ArrayCType structure + act = types.ArrayCTypes(typ) + ctinfo = context.make_helper(builder, act) + ctinfo.data = array.data + ctinfo.meminfo = array.meminfo + res = ctinfo._getvalue() + return impl_ret_borrowed(context, builder, act, res) + + +@lower_getattr(types.ArrayCTypes, "data") +def array_ctypes_data(context, builder, typ, value): + ctinfo = context.make_helper(builder, typ, value=value) + res = ctinfo.data + # Convert it to an integer + res = builder.ptrtoint(res, context.get_value_type(types.intp)) + return impl_ret_untracked(context, builder, typ, res) + + +@lower_cast(types.ArrayCTypes, types.CPointer) +@lower_cast(types.ArrayCTypes, types.voidptr) +def array_ctypes_to_pointer(context, builder, fromty, toty, val): + ctinfo = context.make_helper(builder, fromty, value=val) + res = ctinfo.data + res = builder.bitcast(res, context.get_value_type(toty)) + return impl_ret_untracked(context, builder, toty, res) + + +def _call_contiguous_check(checker, context, builder, aryty, ary): + """Helper to invoke the contiguous checker function on an array + + Args + ---- + checker : + ``numba.numpy_supports.is_contiguous``, or + ``numba.numpy_supports.is_fortran``. 
+ context : target context + builder : llvm ir builder + aryty : numba type + ary : llvm value + """ + ary = make_array(aryty)(context, builder, value=ary) + tup_intp = types.UniTuple(types.intp, aryty.ndim) + itemsize = context.get_abi_sizeof(context.get_value_type(aryty.dtype)) + check_sig = signature(types.bool_, tup_intp, tup_intp, types.intp) + check_args = [ + ary.shape, + ary.strides, + context.get_constant(types.intp, itemsize), + ] + is_contig = context.compile_internal( + builder, checker, check_sig, check_args + ) + return is_contig + + +# array.flags + + +@lower_getattr(types.Array, "flags") +def array_flags(context, builder, typ, value): + flagsobj = context.make_helper(builder, types.ArrayFlags(typ)) + flagsobj.parent = value + res = flagsobj._getvalue() + context.nrt.incref(builder, typ, value) + return impl_ret_new_ref(context, builder, typ, res) + + +@lower_getattr(types.ArrayFlags, "contiguous") +@lower_getattr(types.ArrayFlags, "c_contiguous") +def array_flags_c_contiguous(context, builder, typ, value): + if typ.array_type.layout != "C": + # any layout can still be contiguous + flagsobj = context.make_helper(builder, typ, value=value) + res = _call_contiguous_check( + is_contiguous, context, builder, typ.array_type, flagsobj.parent + ) + else: + val = typ.array_type.layout == "C" + res = context.get_constant(types.boolean, val) + return impl_ret_untracked(context, builder, typ, res) + + +@lower_getattr(types.ArrayFlags, "f_contiguous") +def array_flags_f_contiguous(context, builder, typ, value): + if typ.array_type.layout != "F": + # any layout can still be contiguous + flagsobj = context.make_helper(builder, typ, value=value) + res = _call_contiguous_check( + is_fortran, context, builder, typ.array_type, flagsobj.parent + ) + else: + layout = typ.array_type.layout + val = layout == "F" if typ.array_type.ndim > 1 else layout in "CF" + res = context.get_constant(types.boolean, val) + return impl_ret_untracked(context, builder, typ, res) + + +# ------------------------------------------------------------------------------ +# .real / .imag + + +@lower_getattr(types.Array, "real") +def array_real_part(context, builder, typ, value): + if typ.dtype in types.complex_domain: + return array_complex_attr(context, builder, typ, value, attr="real") + elif typ.dtype in types.number_domain: + # as an identity function + return impl_ret_borrowed(context, builder, typ, value) + else: + raise NotImplementedError("unsupported .real for {}".format(type.dtype)) + + +@lower_getattr(types.Array, "imag") +def array_imag_part(context, builder, typ, value): + if typ.dtype in types.complex_domain: + return array_complex_attr(context, builder, typ, value, attr="imag") + elif typ.dtype in types.number_domain: + # return a readonly zero array + sig = signature(typ.copy(readonly=True), typ) + arrtype, shapes = _parse_empty_like_args(context, builder, sig, [value]) + ary = _empty_nd_impl(context, builder, arrtype, shapes) + cgutils.memset( + builder, ary.data, builder.mul(ary.itemsize, ary.nitems), 0 + ) + return impl_ret_new_ref( + context, builder, sig.return_type, ary._getvalue() + ) + else: + raise NotImplementedError("unsupported .imag for {}".format(type.dtype)) + + +def array_complex_attr(context, builder, typ, value, attr): + """ + Given a complex array, it's memory layout is: + + R C R C R C + ^ ^ ^ + + (`R` indicates a float for the real part; + `C` indicates a float for the imaginary part; + the `^` indicates the start of each element) + + To get the real part, we can simply change the 
dtype and itemsize to that + of the underlying float type. The new layout is: + + R x R x R x + ^ ^ ^ + + (`x` indicates unused) + + A load operation will use the dtype to determine the number of bytes to + load. + + To get the imaginary part, we shift the pointer by 1 float offset and + change the dtype and itemsize. The new layout is: + + x C x C x C + ^ ^ ^ + """ + if attr not in ["real", "imag"] or typ.dtype not in types.complex_domain: + raise NotImplementedError("cannot get attribute `{}`".format(attr)) + + arrayty = make_array(typ) + array = arrayty(context, builder, value) + + # sizeof underlying float type + flty = typ.dtype.underlying_float + sizeof_flty = context.get_abi_sizeof(context.get_data_type(flty)) + itemsize = array.itemsize.type(sizeof_flty) + + # cast data pointer to float type + llfltptrty = context.get_value_type(flty).as_pointer() + dataptr = builder.bitcast(array.data, llfltptrty) + + # add offset + if attr == "imag": + dataptr = builder.gep(dataptr, [ir.IntType(32)(1)]) + + # make result + resultty = typ.copy(dtype=flty, layout="A") + result = make_array(resultty)(context, builder) + repl = dict(data=dataptr, itemsize=itemsize) + cgutils.copy_struct(result, array, repl) + return impl_ret_borrowed(context, builder, resultty, result._getvalue()) + + +@overload_method(types.Array, "conj") +@overload_method(types.Array, "conjugate") +def array_conj(arr): + def impl(arr): + return np.conj(arr) + + return impl + + +# ------------------------------------------------------------------------------ +# DType attribute + + +def dtype_type(context, builder, dtypety, dtypeval): + # Just return a dummy opaque value + return context.get_dummy_value() + + +lower_getattr(types.DType, "type")(dtype_type) +lower_getattr(types.DType, "kind")(dtype_type) + + +# ------------------------------------------------------------------------------ +# static_getitem on Numba numerical types to create "array" types + + +@lower_builtin("static_getitem", types.NumberClass, types.Any) +def static_getitem_number_clazz(context, builder, sig, args): + """This handles the "static_getitem" when a Numba type is subscripted e.g: + var = typed.List.empty_list(float64[::1, :]) + It only allows this on simple numerical types. Compound types, like + records, are not supported. + """ + retty = sig.return_type + if isinstance(retty, types.Array): + # This isn't used or practically accessible, but has to exist, so just + # put in a NULL of the right type. + res = context.get_value_type(retty)(None) + return impl_ret_untracked(context, builder, retty, res) + else: + # This should be unreachable unless the implementation on the Type + # metaclass is changed. + msg = ( + "Unreachable; the definition of __getitem__ on the " + "numba.types.abstract.Type metaclass should prevent access." + ) + raise errors.LoweringError(msg) + + +# ------------------------------------------------------------------------------ +# Structured / record lookup + + +@lower_getattr_generic(types.Array) +def array_record_getattr(context, builder, typ, value, attr): + """ + Generic getattr() implementation for record arrays: fetch the given + record member, i.e. a subarray. 
+ """ + arrayty = make_array(typ) + array = arrayty(context, builder, value) + + rectype = typ.dtype + if not isinstance(rectype, types.Record): + raise NotImplementedError( + "attribute %r of %s not defined" % (attr, typ) + ) + dtype = rectype.typeof(attr) + offset = rectype.offset(attr) + + if isinstance(dtype, types.NestedArray): + resty = typ.copy( + dtype=dtype.dtype, ndim=typ.ndim + dtype.ndim, layout="A" + ) + else: + resty = typ.copy(dtype=dtype, layout="A") + + raryty = make_array(resty) + + rary = raryty(context, builder) + + constoffset = context.get_constant(types.intp, offset) + + newdataptr = cgutils.pointer_add( + builder, + array.data, + constoffset, + return_type=rary.data.type, + ) + if isinstance(dtype, types.NestedArray): + # new shape = recarray shape + inner dimension from nestedarray + shape = cgutils.unpack_tuple(builder, array.shape, typ.ndim) + shape += [context.get_constant(types.intp, i) for i in dtype.shape] + # new strides = recarray strides + strides of the inner nestedarray + strides = cgutils.unpack_tuple(builder, array.strides, typ.ndim) + strides += [context.get_constant(types.intp, i) for i in dtype.strides] + # New datasize = size of elements of the nestedarray + datasize = context.get_abi_sizeof(context.get_data_type(dtype.dtype)) + else: + # New shape, strides, and datasize match the underlying array + shape = array.shape + strides = array.strides + datasize = context.get_abi_sizeof(context.get_data_type(dtype)) + populate_array( + rary, + data=newdataptr, + shape=shape, + strides=strides, + itemsize=context.get_constant(types.intp, datasize), + meminfo=array.meminfo, + parent=array.parent, + ) + res = rary._getvalue() + return impl_ret_borrowed(context, builder, resty, res) + + +@lower_builtin("static_getitem", types.Array, types.StringLiteral) +def array_record_getitem(context, builder, sig, args): + index = args[1] + if not isinstance(index, str): + # This will fallback to normal getitem + raise NotImplementedError + return array_record_getattr(context, builder, sig.args[0], args[0], index) + + +@lower_getattr_generic(types.Record) +def record_getattr(context, builder, typ, value, attr): + """ + Generic getattr() implementation for records: get the given record member. + """ + context.sentry_record_alignment(typ, attr) + offset = typ.offset(attr) + elemty = typ.typeof(attr) + + if isinstance(elemty, types.NestedArray): + # Only a nested array's *data* is stored in a structured array, + # so we create an array structure to point to that data. 
+ aryty = make_array(elemty) + ary = aryty(context, builder) + dtype = elemty.dtype + newshape = [context.get_constant(types.intp, s) for s in elemty.shape] + newstrides = [ + context.get_constant(types.intp, s) for s in elemty.strides + ] + newdata = cgutils.get_record_member( + builder, value, offset, context.get_data_type(dtype) + ) + populate_array( + ary, + data=newdata, + shape=cgutils.pack_array(builder, newshape), + strides=cgutils.pack_array(builder, newstrides), + itemsize=context.get_constant(types.intp, elemty.size), + meminfo=None, + parent=None, + ) + res = ary._getvalue() + return impl_ret_borrowed(context, builder, typ, res) + else: + dptr = cgutils.get_record_member( + builder, value, offset, context.get_data_type(elemty) + ) + align = None if typ.aligned else 1 + res = context.unpack_value(builder, elemty, dptr, align) + return impl_ret_borrowed(context, builder, typ, res) + + +@lower_setattr_generic(types.Record) +def record_setattr(context, builder, sig, args, attr): + """ + Generic setattr() implementation for records: set the given record member. + """ + typ, valty = sig.args + target, val = args + + context.sentry_record_alignment(typ, attr) + offset = typ.offset(attr) + elemty = typ.typeof(attr) + + if isinstance(elemty, types.NestedArray): + # Copy the data from the RHS into the nested array + val_struct = cgutils.create_struct_proxy(valty)( + context, builder, value=args[1] + ) + src = val_struct.data + dest = cgutils.get_record_member( + builder, target, offset, src.type.pointee + ) + cgutils.memcpy( + builder, dest, src, context.get_constant(types.intp, elemty.nitems) + ) + else: + # Set the given scalar record member + dptr = cgutils.get_record_member( + builder, target, offset, context.get_data_type(elemty) + ) + val = context.cast(builder, val, valty, elemty) + align = None if typ.aligned else 1 + context.pack_value(builder, elemty, val, dptr, align=align) + + +@lower_builtin("static_getitem", types.Record, types.StringLiteral) +def record_static_getitem_str(context, builder, sig, args): + """ + Record.__getitem__ redirects to getattr() + """ + impl = context.get_getattr(sig.args[0], args[1]) + return impl(context, builder, sig.args[0], args[0], args[1]) + + +@lower_builtin("static_getitem", types.Record, types.IntegerLiteral) +def record_static_getitem_int(context, builder, sig, args): + """ + Record.__getitem__ redirects to getattr() + """ + idx = sig.args[1].literal_value + fields = list(sig.args[0].fields) + ll_field = context.insert_const_string(builder.module, fields[idx]) + impl = context.get_getattr(sig.args[0], ll_field) + return impl(context, builder, sig.args[0], args[0], fields[idx]) + + +@lower_builtin("static_setitem", types.Record, types.StringLiteral, types.Any) +def record_static_setitem_str(context, builder, sig, args): + """ + Record.__setitem__ redirects to setattr() + """ + recty, _, valty = sig.args + rec, idx, val = args + getattr_sig = signature(sig.return_type, recty, valty) + impl = context.get_setattr(idx, getattr_sig) + assert impl is not None + return impl(builder, (rec, val)) + + +@lower_builtin("static_setitem", types.Record, types.IntegerLiteral, types.Any) +def record_static_setitem_int(context, builder, sig, args): + """ + Record.__setitem__ redirects to setattr() + """ + recty, _, valty = sig.args + rec, idx, val = args + getattr_sig = signature(sig.return_type, recty, valty) + fields = list(sig.args[0].fields) + impl = context.get_setattr(fields[idx], getattr_sig) + assert impl is not None + return impl(builder, (rec, val)) 
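+
+
+# NOTE: a minimal illustration (not part of the implementation) of how the
+# record lowerings above are exercised from user code; the dtype and the field
+# names "x" and "y" are hypothetical:
+#
+#   rec_dt = np.dtype([("x", np.float32), ("y", np.int32)])
+#
+#   @cuda.jit
+#   def scale(recs):
+#       i = cuda.grid(1)
+#       if i < recs.size:
+#           # string-literal field access routes through the Record
+#           # static_getitem / static_setitem lowerings defined above
+#           recs[i]["y"] = np.int32(recs[i]["x"] * 2)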
+ + +# ------------------------------------------------------------------------------ +# Constant arrays and records + + +@lower_constant(types.Array) +def constant_array(context, builder, ty, pyval): + """ + Create a constant array (mechanism is target-dependent). + """ + return context.make_constant_array(builder, ty, pyval) + + +@lower_constant(types.Record) +def constant_record(context, builder, ty, pyval): + """ + Create a record constant as a stack-allocated array of bytes. + """ + lty = ir.ArrayType(ir.IntType(8), pyval.nbytes) + val = lty(bytearray(pyval.tostring())) + return cgutils.alloca_once_value(builder, val) + + +@lower_constant(types.Bytes) +def constant_bytes(context, builder, ty, pyval): + """ + Create a constant array from bytes (mechanism is target-dependent). + """ + buf = np.array(bytearray(pyval), dtype=np.uint8) + return context.make_constant_array(builder, ty, buf) + + +# ------------------------------------------------------------------------------ +# Comparisons + + +@lower_builtin(operator.is_, types.Array, types.Array) +def array_is(context, builder, sig, args): + aty, bty = sig.args + if aty != bty: + return cgutils.false_bit + + def array_is_impl(a, b): + return ( + a.shape == b.shape + and a.strides == b.strides + and a.ctypes.data == b.ctypes.data + ) + + return context.compile_internal(builder, array_is_impl, sig, args) + + +# ------------------------------------------------------------------------------ +# Hash + + +@overload_attribute(types.Array, "__hash__") +def ol_array_hash(arr): + return lambda arr: None + + +# ------------------------------------------------------------------------------ +# builtin `np.flat` implementation + + +def make_array_flat_cls(flatiterty): + """ + Return the Structure representation of the given *flatiterty* (an + instance of types.NumpyFlatType). + """ + return _make_flattening_iter_cls(flatiterty, "flat") + + +def make_array_ndenumerate_cls(nditerty): + """ + Return the Structure representation of the given *nditerty* (an + instance of types.NumpyNdEnumerateType). + """ + return _make_flattening_iter_cls(nditerty, "ndenumerate") + + +def _increment_indices( + context, + builder, + ndim, + shape, + indices, + end_flag=None, + loop_continue=None, + loop_break=None, +): + zero = context.get_constant(types.intp, 0) + + bbend = builder.append_basic_block("end_increment") + + if end_flag is not None: + builder.store(cgutils.false_byte, end_flag) + + for dim in reversed(range(ndim)): + idxptr = cgutils.gep_inbounds(builder, indices, dim) + idx = cgutils.increment_index(builder, builder.load(idxptr)) + + count = shape[dim] + in_bounds = builder.icmp_signed("<", idx, count) + with cgutils.if_likely(builder, in_bounds): + # New index is still in bounds + builder.store(idx, idxptr) + if loop_continue is not None: + loop_continue(dim) + builder.branch(bbend) + # Index out of bounds => reset it and proceed it to outer index + builder.store(zero, idxptr) + if loop_break is not None: + loop_break(dim) + + if end_flag is not None: + builder.store(cgutils.true_byte, end_flag) + builder.branch(bbend) + + builder.position_at_end(bbend) + + +def _increment_indices_array( + context, builder, arrty, arr, indices, end_flag=None +): + shape = cgutils.unpack_tuple(builder, arr.shape, arrty.ndim) + _increment_indices(context, builder, arrty.ndim, shape, indices, end_flag) + + +def make_nditer_cls(nditerty): + """ + Return the Structure representation of the given *nditerty* (an + instance of types.NumpyNdIterType). 
+ """ + ndim = nditerty.ndim + layout = nditerty.layout + narrays = len(nditerty.arrays) + nshapes = ndim if nditerty.need_shaped_indexing else 1 + + class BaseSubIter(object): + """ + Base class for sub-iterators of a nditer() instance. + """ + + def __init__(self, nditer, member_name, start_dim, end_dim): + self.nditer = nditer + self.member_name = member_name + self.start_dim = start_dim + self.end_dim = end_dim + self.ndim = end_dim - start_dim + + def set_member_ptr(self, ptr): + setattr(self.nditer, self.member_name, ptr) + + @functools.cached_property + def member_ptr(self): + return getattr(self.nditer, self.member_name) + + def init_specific(self, context, builder): + pass + + def loop_continue(self, context, builder, logical_dim): + pass + + def loop_break(self, context, builder, logical_dim): + pass + + class FlatSubIter(BaseSubIter): + """ + Sub-iterator walking a contiguous array in physical order, with + support for broadcasting (the index is reset on the outer dimension). + """ + + def init_specific(self, context, builder): + zero = context.get_constant(types.intp, 0) + self.set_member_ptr(cgutils.alloca_once_value(builder, zero)) + + def compute_pointer(self, context, builder, indices, arrty, arr): + index = builder.load(self.member_ptr) + return builder.gep(arr.data, [index]) + + def loop_continue(self, context, builder, logical_dim): + if logical_dim == self.ndim - 1: + # Only increment index inside innermost logical dimension + index = builder.load(self.member_ptr) + index = cgutils.increment_index(builder, index) + builder.store(index, self.member_ptr) + + def loop_break(self, context, builder, logical_dim): + if logical_dim == 0: + # At the exit of outermost logical dimension, reset index + zero = context.get_constant(types.intp, 0) + builder.store(zero, self.member_ptr) + elif logical_dim == self.ndim - 1: + # Inside innermost logical dimension, increment index + index = builder.load(self.member_ptr) + index = cgutils.increment_index(builder, index) + builder.store(index, self.member_ptr) + + class TrivialFlatSubIter(BaseSubIter): + """ + Sub-iterator walking a contiguous array in physical order, + *without* support for broadcasting. + """ + + def init_specific(self, context, builder): + assert not nditerty.need_shaped_indexing + + def compute_pointer(self, context, builder, indices, arrty, arr): + assert len(indices) <= 1, len(indices) + return builder.gep(arr.data, indices) + + class IndexedSubIter(BaseSubIter): + """ + Sub-iterator walking an array in logical order. + """ + + def compute_pointer(self, context, builder, indices, arrty, arr): + assert len(indices) == self.ndim + return cgutils.get_item_pointer( + context, builder, arrty, arr, indices, wraparound=False + ) + + class ZeroDimSubIter(BaseSubIter): + """ + Sub-iterator "walking" a 0-d array. + """ + + def compute_pointer(self, context, builder, indices, arrty, arr): + return arr.data + + class ScalarSubIter(BaseSubIter): + """ + Sub-iterator "walking" a scalar value. + """ + + def compute_pointer(self, context, builder, indices, arrty, arr): + return arr + + class NdIter(cgutils.create_struct_proxy(nditerty)): + """ + .nditer() implementation. + + Note: 'F' layout means the shape is iterated in reverse logical order, + so indices and shapes arrays have to be reversed as well. 
+ """ + + @functools.cached_property + def subiters(self): + l = [] + factories = { + "flat": FlatSubIter + if nditerty.need_shaped_indexing + else TrivialFlatSubIter, + "indexed": IndexedSubIter, + "0d": ZeroDimSubIter, + "scalar": ScalarSubIter, + } + for i, sub in enumerate(nditerty.indexers): + kind, start_dim, end_dim, _ = sub + member_name = "index%d" % i + factory = factories[kind] + l.append(factory(self, member_name, start_dim, end_dim)) + return l + + def init_specific(self, context, builder, arrtys, arrays): + """ + Initialize the nditer() instance for the specific array inputs. + """ + zero = context.get_constant(types.intp, 0) + + # Store inputs + self.arrays = context.make_tuple( + builder, types.Tuple(arrtys), arrays + ) + # Create slots for scalars + for i, ty in enumerate(arrtys): + if not isinstance(ty, types.Array): + member_name = "scalar%d" % i + # XXX as_data()? + slot = cgutils.alloca_once_value(builder, arrays[i]) + setattr(self, member_name, slot) + + arrays = self._arrays_or_scalars(context, builder, arrtys, arrays) + + # Extract iterator shape (the shape of the most-dimensional input) + main_shape_ty = types.UniTuple(types.intp, ndim) + main_shape = None + main_nitems = None + for i, arrty in enumerate(arrtys): + if isinstance(arrty, types.Array) and arrty.ndim == ndim: + main_shape = arrays[i].shape + main_nitems = arrays[i].nitems + break + else: + # Only scalar inputs => synthesize a dummy shape + assert ndim == 0 + main_shape = context.make_tuple(builder, main_shape_ty, ()) + main_nitems = context.get_constant(types.intp, 1) + + # Validate shapes of array inputs + def check_shape(shape, main_shape): + n = len(shape) + for i in range(n): + if shape[i] != main_shape[len(main_shape) - n + i]: + raise ValueError( + "nditer(): operands could not be broadcast together" + ) + + for arrty, arr in zip(arrtys, arrays): + if isinstance(arrty, types.Array) and arrty.ndim > 0: + sig = signature( + types.none, + types.UniTuple(types.intp, arrty.ndim), + main_shape_ty, + ) + context.compile_internal( + builder, check_shape, sig, (arr.shape, main_shape) + ) + + # Compute shape and size + shapes = cgutils.unpack_tuple(builder, main_shape) + if layout == "F": + shapes = shapes[::-1] + + # If shape is empty, mark iterator exhausted + shape_is_empty = builder.icmp_signed("==", main_nitems, zero) + exhausted = builder.select( + shape_is_empty, cgutils.true_byte, cgutils.false_byte + ) + + if not nditerty.need_shaped_indexing: + # Flatten shape to make iteration faster on small innermost + # dimensions (e.g. a (100000, 3) shape) + shapes = (main_nitems,) + assert len(shapes) == nshapes + + indices = cgutils.alloca_once(builder, zero.type, size=nshapes) + for dim in range(nshapes): + idxptr = cgutils.gep_inbounds(builder, indices, dim) + builder.store(zero, idxptr) + + self.indices = indices + self.shape = cgutils.pack_array(builder, shapes, zero.type) + self.exhausted = cgutils.alloca_once_value(builder, exhausted) + + # Initialize subiterators + for subiter in self.subiters: + subiter.init_specific(context, builder) + + def iternext_specific(self, context, builder, result): + """ + Compute next iteration of the nditer() instance. 
+ """ + bbend = builder.append_basic_block("end") + + # Branch early if exhausted + exhausted = cgutils.as_bool_bit( + builder, builder.load(self.exhausted) + ) + with cgutils.if_unlikely(builder, exhausted): + result.set_valid(False) + builder.branch(bbend) + + arrtys = nditerty.arrays + arrays = cgutils.unpack_tuple(builder, self.arrays) + arrays = self._arrays_or_scalars(context, builder, arrtys, arrays) + indices = self.indices + + # Compute iterated results + result.set_valid(True) + views = self._make_views(context, builder, indices, arrtys, arrays) + views = [v._getvalue() for v in views] + if len(views) == 1: + result.yield_(views[0]) + else: + result.yield_( + context.make_tuple(builder, nditerty.yield_type, views) + ) + + shape = cgutils.unpack_tuple(builder, self.shape) + _increment_indices( + context, + builder, + len(shape), + shape, + indices, + self.exhausted, + functools.partial(self._loop_continue, context, builder), + functools.partial(self._loop_break, context, builder), + ) + + builder.branch(bbend) + builder.position_at_end(bbend) + + def _loop_continue(self, context, builder, dim): + for sub in self.subiters: + if sub.start_dim <= dim < sub.end_dim: + sub.loop_continue(context, builder, dim - sub.start_dim) + + def _loop_break(self, context, builder, dim): + for sub in self.subiters: + if sub.start_dim <= dim < sub.end_dim: + sub.loop_break(context, builder, dim - sub.start_dim) + + def _make_views(self, context, builder, indices, arrtys, arrays): + """ + Compute the views to be yielded. + """ + views = [None] * narrays + indexers = nditerty.indexers + subiters = self.subiters + rettys = nditerty.yield_type + if isinstance(rettys, types.BaseTuple): + rettys = list(rettys) + else: + rettys = [rettys] + indices = [ + builder.load(cgutils.gep_inbounds(builder, indices, i)) + for i in range(nshapes) + ] + + for sub, subiter in zip(indexers, subiters): + _, _, _, array_indices = sub + sub_indices = indices[subiter.start_dim : subiter.end_dim] + if layout == "F": + sub_indices = sub_indices[::-1] + for i in array_indices: + assert views[i] is None + views[i] = self._make_view( + context, + builder, + sub_indices, + rettys[i], + arrtys[i], + arrays[i], + subiter, + ) + assert all(v for v in views) + return views + + def _make_view( + self, context, builder, indices, retty, arrty, arr, subiter + ): + """ + Compute a 0d view for a given input array. + """ + assert isinstance(retty, types.Array) and retty.ndim == 0 + + ptr = subiter.compute_pointer(context, builder, indices, arrty, arr) + view = context.make_array(retty)(context, builder) + + itemsize = get_itemsize(context, retty) + shape = context.make_tuple( + builder, types.UniTuple(types.intp, 0), () + ) + strides = context.make_tuple( + builder, types.UniTuple(types.intp, 0), () + ) + # HACK: meminfo=None avoids expensive refcounting operations + # on ephemeral views + populate_array(view, ptr, shape, strides, itemsize, meminfo=None) + return view + + def _arrays_or_scalars(self, context, builder, arrtys, arrays): + # Return a list of either array structures or pointers to + # scalar slots + l = [] + for i, (arrty, arr) in enumerate(zip(arrtys, arrays)): + if isinstance(arrty, types.Array): + l.append( + context.make_array(arrty)(context, builder, value=arr) + ) + else: + l.append(getattr(self, "scalar%d" % i)) + return l + + return NdIter + + +def make_ndindex_cls(nditerty): + """ + Return the Structure representation of the given *nditerty* (an + instance of types.NumpyNdIndexType). 
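+
+ A brief usage sketch of the iteration order implemented here (plain
+ NumPy semantics, illustrative only):
+
+     for idx in np.ndindex(2, 3):
+         ...  # yields (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)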
+ """ + ndim = nditerty.ndim + + class NdIndexIter(cgutils.create_struct_proxy(nditerty)): + """ + .ndindex() implementation. + """ + + def init_specific(self, context, builder, shapes): + zero = context.get_constant(types.intp, 0) + indices = cgutils.alloca_once( + builder, zero.type, size=context.get_constant(types.intp, ndim) + ) + exhausted = cgutils.alloca_once_value(builder, cgutils.false_byte) + + for dim in range(ndim): + idxptr = cgutils.gep_inbounds(builder, indices, dim) + builder.store(zero, idxptr) + # 0-sized dimensions really indicate an empty array, + # but we have to catch that condition early to avoid + # a bug inside the iteration logic. + dim_size = shapes[dim] + dim_is_empty = builder.icmp_unsigned("==", dim_size, zero) + with cgutils.if_unlikely(builder, dim_is_empty): + builder.store(cgutils.true_byte, exhausted) + + self.indices = indices + self.exhausted = exhausted + self.shape = cgutils.pack_array(builder, shapes, zero.type) + + def iternext_specific(self, context, builder, result): + zero = context.get_constant(types.intp, 0) + + bbend = builder.append_basic_block("end") + + exhausted = cgutils.as_bool_bit( + builder, builder.load(self.exhausted) + ) + with cgutils.if_unlikely(builder, exhausted): + result.set_valid(False) + builder.branch(bbend) + + indices = [ + builder.load(cgutils.gep_inbounds(builder, self.indices, dim)) + for dim in range(ndim) + ] + for load in indices: + mark_positive(builder, load) + + result.yield_(cgutils.pack_array(builder, indices, zero.type)) + result.set_valid(True) + + shape = cgutils.unpack_tuple(builder, self.shape, ndim) + _increment_indices( + context, builder, ndim, shape, self.indices, self.exhausted + ) + + builder.branch(bbend) + builder.position_at_end(bbend) + + return NdIndexIter + + +def _make_flattening_iter_cls(flatiterty, kind): + assert kind in ("flat", "ndenumerate") + + array_type = flatiterty.array_type + + if array_type.layout == "C": + + class CContiguousFlatIter(cgutils.create_struct_proxy(flatiterty)): + """ + .flat() / .ndenumerate() implementation for C-contiguous arrays. + """ + + def init_specific(self, context, builder, arrty, arr): + zero = context.get_constant(types.intp, 0) + self.index = cgutils.alloca_once_value(builder, zero) + # We can't trust strides[-1] to always contain the right + # step value, see + # http://docs.scipy.org/doc/numpy-dev/release.html#npy-relaxed-strides-checking # noqa: E501 + self.stride = arr.itemsize + + if kind == "ndenumerate": + # Zero-initialize the indices array. + indices = cgutils.alloca_once( + builder, + zero.type, + size=context.get_constant(types.intp, arrty.ndim), + ) + + for dim in range(arrty.ndim): + idxptr = cgutils.gep_inbounds(builder, indices, dim) + builder.store(zero, idxptr) + + self.indices = indices + + # NOTE: Using gep() instead of explicit pointer addition helps + # LLVM vectorize the loop (since the stride is known and + # constant). This is not possible in the non-contiguous case, + # where the strides are unknown at compile-time. 
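+ # As an informal sketch (not the emitted IR), the C-contiguous flat
+ # iteration below behaves like:
+ #     for i in range(a.size):
+ #         yield a.ravel()[i]   # plus the running nd-index for ndenumerate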
+ + def iternext_specific(self, context, builder, arrty, arr, result): + ndim = arrty.ndim + nitems = arr.nitems + + index = builder.load(self.index) + is_valid = builder.icmp_signed("<", index, nitems) + result.set_valid(is_valid) + + with cgutils.if_likely(builder, is_valid): + ptr = builder.gep(arr.data, [index]) + value = load_item(context, builder, arrty, ptr) + if kind == "flat": + result.yield_(value) + else: + # ndenumerate(): fetch and increment indices + indices = self.indices + idxvals = [ + builder.load( + cgutils.gep_inbounds(builder, indices, dim) + ) + for dim in range(ndim) + ] + idxtuple = cgutils.pack_array(builder, idxvals) + result.yield_( + cgutils.make_anonymous_struct( + builder, [idxtuple, value] + ) + ) + _increment_indices_array( + context, builder, arrty, arr, indices + ) + + index = cgutils.increment_index(builder, index) + builder.store(index, self.index) + + def getitem(self, context, builder, arrty, arr, index): + ptr = builder.gep(arr.data, [index]) + return load_item(context, builder, arrty, ptr) + + def setitem(self, context, builder, arrty, arr, index, value): + ptr = builder.gep(arr.data, [index]) + store_item(context, builder, arrty, value, ptr) + + return CContiguousFlatIter + + else: + + class FlatIter(cgutils.create_struct_proxy(flatiterty)): + """ + Generic .flat() / .ndenumerate() implementation for + non-contiguous arrays. + It keeps track of pointers along each dimension in order to + minimize computations. + """ + + def init_specific(self, context, builder, arrty, arr): + zero = context.get_constant(types.intp, 0) + data = arr.data + ndim = arrty.ndim + shapes = cgutils.unpack_tuple(builder, arr.shape, ndim) + + indices = cgutils.alloca_once( + builder, + zero.type, + size=context.get_constant(types.intp, arrty.ndim), + ) + pointers = cgutils.alloca_once( + builder, + data.type, + size=context.get_constant(types.intp, arrty.ndim), + ) + exhausted = cgutils.alloca_once_value( + builder, cgutils.false_byte + ) + + # Initialize indices and pointers with their start values. + for dim in range(ndim): + idxptr = cgutils.gep_inbounds(builder, indices, dim) + ptrptr = cgutils.gep_inbounds(builder, pointers, dim) + builder.store(data, ptrptr) + builder.store(zero, idxptr) + # 0-sized dimensions really indicate an empty array, + # but we have to catch that condition early to avoid + # a bug inside the iteration logic (see issue #846). 
+ dim_size = shapes[dim] + dim_is_empty = builder.icmp_unsigned("==", dim_size, zero) + with cgutils.if_unlikely(builder, dim_is_empty): + builder.store(cgutils.true_byte, exhausted) + + self.indices = indices + self.pointers = pointers + self.exhausted = exhausted + + def iternext_specific(self, context, builder, arrty, arr, result): + ndim = arrty.ndim + shapes = cgutils.unpack_tuple(builder, arr.shape, ndim) + strides = cgutils.unpack_tuple(builder, arr.strides, ndim) + indices = self.indices + pointers = self.pointers + + zero = context.get_constant(types.intp, 0) + + bbend = builder.append_basic_block("end") + + # Catch already computed iterator exhaustion + is_exhausted = cgutils.as_bool_bit( + builder, builder.load(self.exhausted) + ) + with cgutils.if_unlikely(builder, is_exhausted): + result.set_valid(False) + builder.branch(bbend) + result.set_valid(True) + + # Current pointer inside last dimension + last_ptr = cgutils.gep_inbounds(builder, pointers, ndim - 1) + ptr = builder.load(last_ptr) + value = load_item(context, builder, arrty, ptr) + if kind == "flat": + result.yield_(value) + else: + # ndenumerate() => yield (indices, value) + idxvals = [ + builder.load( + cgutils.gep_inbounds(builder, indices, dim) + ) + for dim in range(ndim) + ] + idxtuple = cgutils.pack_array(builder, idxvals) + result.yield_( + cgutils.make_anonymous_struct( + builder, [idxtuple, value] + ) + ) + + # Update indices and pointers by walking from inner + # dimension to outer. + for dim in reversed(range(ndim)): + idxptr = cgutils.gep_inbounds(builder, indices, dim) + idx = cgutils.increment_index(builder, builder.load(idxptr)) + + count = shapes[dim] + stride = strides[dim] + in_bounds = builder.icmp_signed("<", idx, count) + with cgutils.if_likely(builder, in_bounds): + # Index is valid => pointer can simply be incremented. 
+ builder.store(idx, idxptr) + ptrptr = cgutils.gep_inbounds(builder, pointers, dim) + ptr = builder.load(ptrptr) + ptr = cgutils.pointer_add(builder, ptr, stride) + builder.store(ptr, ptrptr) + # Reset pointers in inner dimensions + for inner_dim in range(dim + 1, ndim): + ptrptr = cgutils.gep_inbounds( + builder, pointers, inner_dim + ) + builder.store(ptr, ptrptr) + builder.branch(bbend) + # Reset index and continue with next dimension + builder.store(zero, idxptr) + + # End of array + builder.store(cgutils.true_byte, self.exhausted) + builder.branch(bbend) + + builder.position_at_end(bbend) + + def _ptr_for_index(self, context, builder, arrty, arr, index): + ndim = arrty.ndim + shapes = cgutils.unpack_tuple(builder, arr.shape, count=ndim) + strides = cgutils.unpack_tuple(builder, arr.strides, count=ndim) + + # First convert the flattened index into a regular n-dim index + indices = [] + for dim in reversed(range(ndim)): + indices.append(builder.urem(index, shapes[dim])) + index = builder.udiv(index, shapes[dim]) + indices.reverse() + + ptr = cgutils.get_item_pointer2( + context, + builder, + arr.data, + shapes, + strides, + arrty.layout, + indices, + ) + return ptr + + def getitem(self, context, builder, arrty, arr, index): + ptr = self._ptr_for_index(context, builder, arrty, arr, index) + return load_item(context, builder, arrty, ptr) + + def setitem(self, context, builder, arrty, arr, index, value): + ptr = self._ptr_for_index(context, builder, arrty, arr, index) + store_item(context, builder, arrty, value, ptr) + + return FlatIter + + +@lower_getattr(types.Array, "flat") +def make_array_flatiter(context, builder, arrty, arr): + flatitercls = make_array_flat_cls(types.NumpyFlatType(arrty)) + flatiter = flatitercls(context, builder) + + flatiter.array = arr + + arrcls = context.make_array(arrty) + arr = arrcls(context, builder, ref=flatiter._get_ptr_by_name("array")) + + flatiter.init_specific(context, builder, arrty, arr) + + res = flatiter._getvalue() + return impl_ret_borrowed(context, builder, types.NumpyFlatType(arrty), res) + + +@lower_builtin("iternext", types.NumpyFlatType) +@iternext_impl(RefType.BORROWED) +def iternext_numpy_flatiter(context, builder, sig, args, result): + [flatiterty] = sig.args + [flatiter] = args + + flatitercls = make_array_flat_cls(flatiterty) + flatiter = flatitercls(context, builder, value=flatiter) + + arrty = flatiterty.array_type + arrcls = context.make_array(arrty) + arr = arrcls(context, builder, value=flatiter.array) + + flatiter.iternext_specific(context, builder, arrty, arr, result) + + +@lower_builtin(operator.getitem, types.NumpyFlatType, types.Integer) +def iternext_numpy_getitem(context, builder, sig, args): + flatiterty = sig.args[0] + flatiter, index = args + + flatitercls = make_array_flat_cls(flatiterty) + flatiter = flatitercls(context, builder, value=flatiter) + + arrty = flatiterty.array_type + arrcls = context.make_array(arrty) + arr = arrcls(context, builder, value=flatiter.array) + + res = flatiter.getitem(context, builder, arrty, arr, index) + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin(operator.setitem, types.NumpyFlatType, types.Integer, types.Any) +def iternext_numpy_getitem_any(context, builder, sig, args): + flatiterty = sig.args[0] + flatiter, index, value = args + + flatitercls = make_array_flat_cls(flatiterty) + flatiter = flatitercls(context, builder, value=flatiter) + + arrty = flatiterty.array_type + arrcls = context.make_array(arrty) + arr = arrcls(context, builder, 
value=flatiter.array) + + flatiter.setitem(context, builder, arrty, arr, index, value) + return context.get_dummy_value() + + +@lower_builtin(len, types.NumpyFlatType) +def iternext_numpy_getitem_flat(context, builder, sig, args): + flatiterty = sig.args[0] + flatitercls = make_array_flat_cls(flatiterty) + flatiter = flatitercls(context, builder, value=args[0]) + + arrcls = context.make_array(flatiterty.array_type) + arr = arrcls(context, builder, value=flatiter.array) + return arr.nitems + + +@lower_builtin(np.ndenumerate, types.Array) +def make_array_ndenumerate(context, builder, sig, args): + (arrty,) = sig.args + (arr,) = args + nditercls = make_array_ndenumerate_cls(types.NumpyNdEnumerateType(arrty)) + nditer = nditercls(context, builder) + + nditer.array = arr + + arrcls = context.make_array(arrty) + arr = arrcls(context, builder, ref=nditer._get_ptr_by_name("array")) + + nditer.init_specific(context, builder, arrty, arr) + + res = nditer._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin("iternext", types.NumpyNdEnumerateType) +@iternext_impl(RefType.BORROWED) +def iternext_numpy_nditer(context, builder, sig, args, result): + [nditerty] = sig.args + [nditer] = args + + nditercls = make_array_ndenumerate_cls(nditerty) + nditer = nditercls(context, builder, value=nditer) + + arrty = nditerty.array_type + arrcls = context.make_array(arrty) + arr = arrcls(context, builder, value=nditer.array) + + nditer.iternext_specific(context, builder, arrty, arr, result) + + +@lower_builtin(pndindex, types.VarArg(types.Integer)) +@lower_builtin(np.ndindex, types.VarArg(types.Integer)) +def make_array_ndindex(context, builder, sig, args): + """ndindex(*shape)""" + shape = [ + context.cast(builder, arg, argty, types.intp) + for argty, arg in zip(sig.args, args) + ] + + nditercls = make_ndindex_cls(types.NumpyNdIndexType(len(shape))) + nditer = nditercls(context, builder) + nditer.init_specific(context, builder, shape) + + res = nditer._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin(pndindex, types.BaseTuple) +@lower_builtin(np.ndindex, types.BaseTuple) +def make_array_ndindex_tuple(context, builder, sig, args): + """ndindex(shape)""" + ndim = sig.return_type.ndim + if ndim > 0: + idxty = sig.args[0].dtype + tup = args[0] + + shape = cgutils.unpack_tuple(builder, tup, ndim) + shape = [context.cast(builder, idx, idxty, types.intp) for idx in shape] + else: + shape = [] + + nditercls = make_ndindex_cls(types.NumpyNdIndexType(len(shape))) + nditer = nditercls(context, builder) + nditer.init_specific(context, builder, shape) + + res = nditer._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@lower_builtin("iternext", types.NumpyNdIndexType) +@iternext_impl(RefType.BORROWED) +def iternext_numpy_ndindex(context, builder, sig, args, result): + [nditerty] = sig.args + [nditer] = args + + nditercls = make_ndindex_cls(nditerty) + nditer = nditercls(context, builder, value=nditer) + + nditer.iternext_specific(context, builder, result) + + +@lower_builtin(np.nditer, types.Any) +def make_array_nditer(context, builder, sig, args): + """ + nditer(...) 
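+
+ Handles both call forms accepted by this builtin (illustrative):
+
+     np.nditer(a)          # a single array argument
+     np.nditer((a, b))     # a tuple of broadcast-compatible arguments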
+ """ + nditerty = sig.return_type + arrtys = nditerty.arrays + + if isinstance(sig.args[0], types.BaseTuple): + arrays = cgutils.unpack_tuple(builder, args[0]) + else: + arrays = [args[0]] + + nditer = make_nditer_cls(nditerty)(context, builder) + nditer.init_specific(context, builder, arrtys, arrays) + + res = nditer._getvalue() + return impl_ret_borrowed(context, builder, nditerty, res) + + +@lower_builtin("iternext", types.NumpyNdIterType) +@iternext_impl(RefType.BORROWED) +def iternext_numpy_nditer2(context, builder, sig, args, result): + [nditerty] = sig.args + [nditer] = args + + nditer = make_nditer_cls(nditerty)(context, builder, value=nditer) + nditer.iternext_specific(context, builder, result) + + +@lower_builtin(operator.eq, types.DType, types.DType) +def dtype_eq_impl(context, builder, sig, args): + arg1, arg2 = sig.args + res = ir.Constant(ir.IntType(1), int(arg1 == arg2)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# ------------------------------------------------------------------------------ +# Numpy array constructors + + +def _empty_nd_impl(context, builder, arrtype, shapes): + """Utility function used for allocating a new array during LLVM code + generation (lowering). Given a target context, builder, array + type, and a tuple or list of lowered dimension sizes, returns a + LLVM value pointing at a Numba runtime allocated array. + """ + arycls = make_array(arrtype) + ary = arycls(context, builder) + + datatype = context.get_data_type(arrtype.dtype) + itemsize = context.get_constant(types.intp, get_itemsize(context, arrtype)) + + # compute array length + arrlen = context.get_constant(types.intp, 1) + overflow = Constant(ir.IntType(1), 0) + for s in shapes: + arrlen_mult = builder.smul_with_overflow(arrlen, s) + arrlen = builder.extract_value(arrlen_mult, 0) + overflow = builder.or_(overflow, builder.extract_value(arrlen_mult, 1)) + + if arrtype.ndim == 0: + strides = () + elif arrtype.layout == "C": + strides = [itemsize] + for dimension_size in reversed(shapes[1:]): + strides.append(builder.mul(strides[-1], dimension_size)) + strides = tuple(reversed(strides)) + elif arrtype.layout == "F": + strides = [itemsize] + for dimension_size in shapes[:-1]: + strides.append(builder.mul(strides[-1], dimension_size)) + strides = tuple(strides) + else: + raise NotImplementedError( + "Don't know how to allocate array with layout '{0}'.".format( + arrtype.layout + ) + ) + + # Check overflow, numpy also does this after checking order + allocsize_mult = builder.smul_with_overflow(arrlen, itemsize) + allocsize = builder.extract_value(allocsize_mult, 0) + overflow = builder.or_(overflow, builder.extract_value(allocsize_mult, 1)) + + with builder.if_then(overflow, likely=False): + # Raise same error as numpy, see: + # https://github.com/numpy/numpy/blob/2a488fe76a0f732dc418d03b452caace161673da/numpy/core/src/multiarray/ctors.c#L1095-L1101 # noqa: E501 + context.call_conv.return_user_exc( + builder, + ValueError, + ( + "array is too big; `arr.size * arr.dtype.itemsize` is larger than" + " the maximum possible size.", + ), + ) + + dtype = arrtype.dtype + align_val = context.get_preferred_array_alignment(dtype) + align = context.get_constant(types.uint32, align_val) + args = (context.get_dummy_value(), allocsize, align) + + mip = types.MemInfoPointer(types.voidptr) + arytypeclass = types.TypeRef(type(arrtype)) + argtypes = signature(mip, arytypeclass, types.intp, types.uint32) + + meminfo = context.compile_internal(builder, _call_allocator, argtypes, args) + 
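+ # Note: the meminfo returned above owns the allocation; meminfo_data()
+ # below extracts its payload pointer, which is bitcast to the element
+ # type and attached to the array struct via populate_array().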
data = context.nrt.meminfo_data(builder, meminfo) + + intp_t = context.get_value_type(types.intp) + shape_array = cgutils.pack_array(builder, shapes, ty=intp_t) + strides_array = cgutils.pack_array(builder, strides, ty=intp_t) + + populate_array( + ary, + data=builder.bitcast(data, datatype.as_pointer()), + shape=shape_array, + strides=strides_array, + itemsize=itemsize, + meminfo=meminfo, + ) + + return ary + + +@overload_classmethod(types.Array, "_allocate") +def _ol_array_allocate(cls, allocsize, align): + """Implements a Numba-only default target (cpu) classmethod on the array + type. + """ + + def impl(cls, allocsize, align): + return intrin_alloc(allocsize, align) + + return impl + + +def _call_allocator(arrtype, size, align): + """Trampoline to call the intrinsic used for allocation""" + return arrtype._allocate(size, align) + + +@intrinsic +def intrin_alloc(typingctx, allocsize, align): + """Intrinsic to call into the allocator for Array""" + + def codegen(context, builder, signature, args): + [allocsize, align] = args + meminfo = context.nrt.meminfo_alloc_aligned(builder, allocsize, align) + return meminfo + + mip = types.MemInfoPointer(types.voidptr) # return untyped pointer + sig = signature(mip, allocsize, align) + return sig, codegen + + +def _parse_shape(context, builder, ty, val): + """ + Parse the shape argument to an array constructor. + """ + + def safecast_intp(context, builder, src_t, src): + """Cast src to intp only if value can be maintained""" + intp_t = context.get_value_type(types.intp) + intp_width = intp_t.width + intp_ir = ir.IntType(intp_width) + maxval = Constant(intp_ir, ((1 << intp_width - 1) - 1)) + if src_t.width < intp_width: + res = builder.sext(src, intp_ir) + elif src_t.width >= intp_width: + is_larger = builder.icmp_signed(">", src, maxval) + with builder.if_then(is_larger, likely=False): + context.call_conv.return_user_exc( + builder, + ValueError, + ("Cannot safely convert value to intp",), + ) + if src_t.width > intp_width: + res = builder.trunc(src, intp_ir) + else: + res = src + return res + + if isinstance(ty, types.Integer): + ndim = 1 + passed_shapes = [context.cast(builder, val, ty, types.intp)] + else: + assert isinstance(ty, types.BaseTuple) + ndim = ty.count + passed_shapes = cgutils.unpack_tuple(builder, val, count=ndim) + + shapes = [] + for s in passed_shapes: + shapes.append(safecast_intp(context, builder, s.type, s)) + + zero = context.get_constant_generic(builder, types.intp, 0) + for dim in range(ndim): + is_neg = builder.icmp_signed("<", shapes[dim], zero) + with cgutils.if_unlikely(builder, is_neg): + context.call_conv.return_user_exc( + builder, ValueError, ("negative dimensions not allowed",) + ) + + return shapes + + +def _parse_empty_args(context, builder, sig, args): + """ + Parse the arguments of a np.empty(), np.zeros() or np.ones() call. + """ + arrshapetype = sig.args[0] + arrshape = args[0] + arrtype = sig.return_type + return arrtype, _parse_shape(context, builder, arrshapetype, arrshape) + + +def _parse_empty_like_args(context, builder, sig, args): + """ + Parse the arguments of a np.empty_like(), np.zeros_like() or + np.ones_like() call. 
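+
+ Returns a (return_type, shapes) pair; for a non-array (scalar) prototype
+ the shape sequence is empty, producing a 0-d result array.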
+ """ + arytype = sig.args[0] + if isinstance(arytype, types.Array): + ary = make_array(arytype)(context, builder, value=args[0]) + shapes = cgutils.unpack_tuple(builder, ary.shape, count=arytype.ndim) + return sig.return_type, shapes + else: + return sig.return_type, () + + +def _check_const_str_dtype(fname, dtype): + if isinstance(dtype, types.UnicodeType): + msg = f"If np.{fname} dtype is a string it must be a string constant." + raise errors.TypingError(msg) + + +@intrinsic +def numpy_empty_nd(tyctx, ty_shape, ty_dtype, ty_retty_ref): + ty_retty = ty_retty_ref.instance_type + sig = ty_retty(ty_shape, ty_dtype, ty_retty_ref) + + def codegen(cgctx, builder, sig, llargs): + arrtype, shapes = _parse_empty_args(cgctx, builder, sig, llargs) + ary = _empty_nd_impl(cgctx, builder, arrtype, shapes) + return ary._getvalue() + + return sig, codegen + + +@overload(np.empty) +def ol_np_empty(shape, dtype=float): + _check_const_str_dtype("empty", dtype) + if ( + dtype is float + or (isinstance(dtype, types.Function) and dtype.typing_key is float) + or is_nonelike(dtype) + ): # default + nb_dtype = types.double + else: + nb_dtype = ty_parse_dtype(dtype) + + ndim = ty_parse_shape(shape) + if nb_dtype is not None and ndim is not None: + retty = types.Array(dtype=nb_dtype, ndim=ndim, layout="C") + + def impl(shape, dtype=float): + return numpy_empty_nd(shape, dtype, retty) + + return impl + else: + msg = f"Cannot parse input types to function np.empty({shape}, {dtype})" + raise errors.TypingError(msg) + + +@intrinsic +def numpy_empty_like_nd(tyctx, ty_prototype, ty_dtype, ty_retty_ref): + ty_retty = ty_retty_ref.instance_type + sig = ty_retty(ty_prototype, ty_dtype, ty_retty_ref) + + def codegen(cgctx, builder, sig, llargs): + arrtype, shapes = _parse_empty_like_args(cgctx, builder, sig, llargs) + ary = _empty_nd_impl(cgctx, builder, arrtype, shapes) + return ary._getvalue() + + return sig, codegen + + +@overload(np.empty_like) +def ol_np_empty_like(arr, dtype=None): + _check_const_str_dtype("empty_like", dtype) + if not is_nonelike(dtype): + nb_dtype = ty_parse_dtype(dtype) + elif isinstance(arr, types.Array): + nb_dtype = arr.dtype + else: + nb_dtype = arr + if nb_dtype is not None: + if isinstance(arr, types.Array): + layout = arr.layout if arr.layout != "A" else "C" + retty = arr.copy(dtype=nb_dtype, layout=layout, readonly=False) + else: + retty = types.Array(nb_dtype, 0, "C") + else: + msg = ( + "Cannot parse input types to function " + f"np.empty_like({arr}, {dtype})" + ) + raise errors.TypingError(msg) + + def impl(arr, dtype=None): + return numpy_empty_like_nd(arr, dtype, retty) + + return impl + + +@intrinsic +def _zero_fill_array_method(tyctx, self): + sig = types.none(self) + + def codegen(cgctx, builder, sig, llargs): + ary = make_array(sig.args[0])(cgctx, builder, llargs[0]) + cgutils.memset( + builder, ary.data, builder.mul(ary.itemsize, ary.nitems), 0 + ) + + return sig, codegen + + +@overload_method(types.Array, "_zero_fill") +def ol_array_zero_fill(self): + """Adds a `._zero_fill` method to zero fill an array using memset.""" + + def impl(self): + _zero_fill_array_method(self) + + return impl + + +@overload(np.zeros) +def ol_np_zeros(shape, dtype=float): + _check_const_str_dtype("zeros", dtype) + + def impl(shape, dtype=float): + arr = np.empty(shape, dtype=dtype) + arr._zero_fill() + return arr + + return impl + + +@overload(np.zeros_like) +def ol_np_zeros_like(a, dtype=None): + _check_const_str_dtype("zeros_like", dtype) + + # NumPy uses 'a' as the arg name for the array-like + def 
impl(a, dtype=None): + arr = np.empty_like(a, dtype=dtype) + arr._zero_fill() + return arr + + return impl + + +@overload(np.ones_like) +def ol_np_ones_like(a, dtype=None): + _check_const_str_dtype("ones_like", dtype) + + # NumPy uses 'a' as the arg name for the array-like + def impl(a, dtype=None): + arr = np.empty_like(a, dtype=dtype) + arr_flat = arr.flat + for idx in range(len(arr_flat)): + arr_flat[idx] = 1 + return arr + + return impl + + +@overload(np.full) +def impl_np_full(shape, fill_value, dtype=None): + _check_const_str_dtype("full", dtype) + if not is_nonelike(dtype): + nb_dtype = ty_parse_dtype(dtype) + else: + nb_dtype = fill_value + + def full(shape, fill_value, dtype=None): + arr = np.empty(shape, nb_dtype) + arr_flat = arr.flat + for idx in range(len(arr_flat)): + arr_flat[idx] = fill_value + return arr + + return full + + +@overload(np.full_like) +def impl_np_full_like(a, fill_value, dtype=None): + _check_const_str_dtype("full_like", dtype) + + def full_like(a, fill_value, dtype=None): + arr = np.empty_like(a, dtype) + arr_flat = arr.flat + for idx in range(len(arr_flat)): + arr_flat[idx] = fill_value + return arr + + return full_like + + +@overload(np.ones) +def ol_np_ones(shape, dtype=None): + # for some reason the NumPy default for dtype is None in the source but + # ends up as np.float64 by definition. + _check_const_str_dtype("ones", dtype) + + def impl(shape, dtype=None): + arr = np.empty(shape, dtype=dtype) + arr_flat = arr.flat + for idx in range(len(arr_flat)): + arr_flat[idx] = 1 + return arr + + return impl + + +@overload(np.identity) +def impl_np_identity(n, dtype=None): + _check_const_str_dtype("identity", dtype) + if not is_nonelike(dtype): + nb_dtype = ty_parse_dtype(dtype) + else: + nb_dtype = types.double + + def identity(n, dtype=None): + arr = np.zeros((n, n), nb_dtype) + for i in range(n): + arr[i, i] = 1 + return arr + + return identity + + +def _eye_none_handler(N, M): + pass + + +@extending.overload(_eye_none_handler) +def _eye_none_handler_impl(N, M): + if isinstance(M, types.NoneType): + + def impl(N, M): + return N + else: + + def impl(N, M): + return M + + return impl + + +@extending.overload(np.eye) +def numpy_eye(N, M=None, k=0, dtype=float): + if dtype is None or isinstance(dtype, types.NoneType): + dt = np.dtype(float) + elif isinstance(dtype, (types.DTypeSpec, types.Number)): + # dtype or instance of dtype + dt = as_dtype(getattr(dtype, "dtype", dtype)) + else: + dt = np.dtype(dtype) + + def impl(N, M=None, k=0, dtype=float): + _M = _eye_none_handler(N, M) + arr = np.zeros((N, _M), dt) + if k >= 0: + d = min(N, _M - k) + for i in range(d): + arr[i, i + k] = 1 + else: + d = min(N + k, _M) + for i in range(d): + arr[i - k, i] = 1 + return arr + + return impl + + +@overload(np.diag) +def impl_np_diag(v, k=0): + if not type_can_asarray(v): + raise errors.TypingError('The argument "v" must be array-like') + + if isinstance(v, types.Array): + if v.ndim not in (1, 2): + raise errors.NumbaTypeError("Input must be 1- or 2-d.") + + def diag_impl(v, k=0): + if v.ndim == 1: + s = v.shape + n = s[0] + abs(k) + ret = np.zeros((n, n), v.dtype) + if k >= 0: + for i in range(n - k): + ret[i, k + i] = v[i] + else: + for i in range(n + k): + ret[i - k, i] = v[i] + return ret + else: # 2-d + rows, cols = v.shape + if k < 0: + rows = rows + k + if k > 0: + cols = cols - k + n = max(min(rows, cols), 0) + ret = np.empty(n, v.dtype) + if k >= 0: + for i in range(n): + ret[i] = v[i, k + i] + else: + for i in range(n): + ret[i] = v[i - k, i] + return ret + + return 
diag_impl + + +@overload(np.indices) +def numpy_indices(dimensions): + if not isinstance(dimensions, types.UniTuple): + msg = 'The argument "dimensions" must be a tuple of integers' + raise errors.TypingError(msg) + + if not isinstance(dimensions.dtype, types.Integer): + msg = 'The argument "dimensions" must be a tuple of integers' + raise errors.TypingError(msg) + + N = len(dimensions) + shape = (1,) * N + + def impl(dimensions): + res = np.empty((N,) + dimensions, dtype=np.int64) + i = 0 + for dim in dimensions: + idx = np.arange(dim, dtype=np.int64).reshape( + tuple_setitem(shape, i, dim) + ) + res[i] = idx + i += 1 + + return res + + return impl + + +@overload(np.diagflat) +def numpy_diagflat(v, k=0): + if not type_can_asarray(v): + msg = 'The argument "v" must be array-like' + raise errors.TypingError(msg) + + if not isinstance(k, (int, types.Integer)): + msg = 'The argument "k" must be an integer' + raise errors.TypingError(msg) + + def impl(v, k=0): + v = np.asarray(v) + v = v.ravel() + s = len(v) + abs_k = abs(k) + n = s + abs_k + res = np.zeros((n, n), v.dtype) + i = np.maximum(0, -k) + j = np.maximum(0, k) + for t in range(s): + res[i + t, j + t] = v[t] + + return res + + return impl + + +def generate_getitem_setitem_with_axis(ndim, kind): + assert kind in ("getitem", "setitem") + + if kind == "getitem": + fn = """ + def _getitem(a, idx, axis): + if axis == 0: + return a[idx, ...] + """ + for i in range(1, ndim): + lst = (":",) * i + fn += f""" + elif axis == {i}: + return a[{", ".join(lst)}, idx, ...] + """ + else: + fn = """ + def _setitem(a, idx, axis, vals): + if axis == 0: + a[idx, ...] = vals + """ + + for i in range(1, ndim): + lst = (":",) * i + fn += f""" + elif axis == {i}: + a[{", ".join(lst)}, idx, ...] = vals + """ + + fn = textwrap.dedent(fn) + exec(fn, globals()) + fn = globals()[f"_{kind}"] + return register_jitable(fn) + + +@overload(np.take) +@overload_method(types.Array, "take") +def numpy_take(a, indices, axis=None): + if cgutils.is_nonelike(axis): + if isinstance(a, types.Array) and isinstance(indices, types.Integer): + + def take_impl(a, indices, axis=None): + if indices > (a.size - 1) or indices < -a.size: + raise IndexError("Index out of bounds") + return a.ravel()[indices] + + return take_impl + + if isinstance(a, types.Array) and isinstance(indices, types.Array): + F_order = indices.layout == "F" + + def take_impl(a, indices, axis=None): + ret = np.empty(indices.size, dtype=a.dtype) + if F_order: + walker = indices.copy() # get C order + else: + walker = indices + it = np.nditer(walker) + i = 0 + flat = a.ravel() + for x in it: + if x > (a.size - 1) or x < -a.size: + raise IndexError("Index out of bounds") + ret[i] = flat[x] + i = i + 1 + return ret.reshape(indices.shape) + + return take_impl + + if isinstance(a, types.Array) and isinstance( + indices, (types.List, types.BaseTuple) + ): + + def take_impl(a, indices, axis=None): + convert = np.array(indices) + return np.take(a, convert) + + return take_impl + else: + if isinstance(a, types.Array) and isinstance(indices, types.Integer): + t = (0,) * (a.ndim - 1) + + # np.squeeze is too hard to implement in Numba as the tuple "t" + # needs to be allocated beforehand we don't know it's size until + # code gets executed. 
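+ # Illustrative NumPy semantics for this branch: an integer index along
+ # an axis drops that axis, e.g. np.take(a, 1, axis=0) is a[1]. Below,
+ # the lookup is performed with a 1-tuple of indices and the resulting
+ # length-1 axis is squeezed away again.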
+ @register_jitable + def _squeeze(r, axis): + tup = tuple(t) + j = 0 + assert axis < len(r.shape) and r.shape[axis] == 1, r.shape + for idx in range(len(r.shape)): + s = r.shape[idx] + if idx != axis: + tup = tuple_setitem(tup, j, s) + j += 1 + return r.reshape(tup) + + def take_impl(a, indices, axis=None): + r = np.take(a, (indices,), axis=axis) + if a.ndim == 1: + return r[0] + if axis < 0: + axis += a.ndim + return _squeeze(r, axis) + + return take_impl + + if isinstance(a, types.Array) and isinstance( + indices, (types.Array, types.List, types.BaseTuple) + ): + ndim = a.ndim + + _getitem = generate_getitem_setitem_with_axis(ndim, "getitem") + _setitem = generate_getitem_setitem_with_axis(ndim, "setitem") + + def take_impl(a, indices, axis=None): + if axis < 0: + axis += a.ndim + + if axis < 0 or axis >= a.ndim: + msg = ( + f"axis {axis} is out of bounds for array " + f"of dimension {a.ndim}" + ) + raise ValueError(msg) + + shape = tuple_setitem(a.shape, axis, len(indices)) + out = np.empty(shape, dtype=a.dtype) + for i in range(len(indices)): + y = _getitem(a, indices[i], axis) + _setitem(out, i, axis, y) + return out + + return take_impl + + +def _arange_dtype(*args): + bounds = [a for a in args if not isinstance(a, types.NoneType)] + + if any(isinstance(a, types.Complex) for a in bounds): + dtype = types.complex128 + elif any(isinstance(a, types.Float) for a in bounds): + dtype = types.float64 + else: + # `np.arange(10).dtype` is always `np.dtype(int)`, aka `np.int_`, which + # in all released versions of numpy corresponds to the C `long` type. + # Windows 64 is broken by default here because Numba (as of 0.47) does + # not differentiate between Python and NumPy integers, so a `typeof(1)` + # on w64 is `int64`, i.e. `intp`. This means an arange() will + # be typed as arange(int64) and the following will yield int64 opposed + # to int32. Example: without a load of analysis to work out of the args + # were wrapped in NumPy int*() calls it's not possible to detect the + # difference between `np.arange(10)` and `np.arange(np.int64(10)`. + NPY_TY = getattr(types, "int%s" % (8 * np.dtype(int).itemsize)) + + # unliteral these types such that `max` works. + unliteral_bounds = [types.unliteral(x) for x in bounds] + dtype = max( + unliteral_bounds + + [ + NPY_TY, + ] + ) + + return dtype + + +@overload(np.arange) +def np_arange(start, /, stop=None, step=None, dtype=None): + if isinstance(stop, types.Optional): + stop = stop.type + if isinstance(step, types.Optional): + step = step.type + if isinstance(dtype, types.Optional): + dtype = dtype.type + + if stop is None: + stop = types.none + if step is None: + step = types.none + if dtype is None: + dtype = types.none + + if ( + not isinstance(start, types.Number) + or not isinstance(stop, (types.NoneType, types.Number)) + or not isinstance(step, (types.NoneType, types.Number)) + or not isinstance(dtype, (types.NoneType, types.DTypeSpec)) + ): + return + + if isinstance(dtype, types.NoneType): + true_dtype = _arange_dtype(start, stop, step) + else: + true_dtype = dtype.dtype + + use_complex = any( + [isinstance(x, types.Complex) for x in (start, stop, step)] + ) + + start_value = getattr(start, "literal_value", None) + stop_value = getattr(stop, "literal_value", None) + step_value = getattr(step, "literal_value", None) + + def impl(start, /, stop=None, step=None, dtype=None): + # Allow for improved performance if given literal arguments. 
+ lit_start = start_value if start_value is not None else start + lit_stop = stop_value if stop_value is not None else stop + lit_step = step_value if step_value is not None else step + + _step = lit_step if lit_step is not None else 1 + if lit_stop is None: + _start, _stop = 0, lit_start + else: + _start, _stop = lit_start, lit_stop + + if _step == 0: + raise ValueError("Maximum allowed size exceeded") + + nitems_c = (_stop - _start) / _step + nitems_r = int(math.ceil(nitems_c.real)) + + # Binary operator needed for compiler branch pruning. + if use_complex is True: + nitems_i = int(math.ceil(nitems_c.imag)) + nitems = max(min(nitems_i, nitems_r), 0) + else: + nitems = max(nitems_r, 0) + arr = np.empty(nitems, true_dtype) + val = _start + for i in range(nitems): + arr[i] = val + (i * _step) + return arr + + return impl + + +@overload(np.linspace) +def numpy_linspace(start, stop, num=50): + if not all(isinstance(arg, types.Number) for arg in [start, stop]): + return + + if not isinstance(num, (int, types.Integer)): + msg = 'The argument "num" must be an integer' + raise errors.TypingError(msg) + + if any(isinstance(arg, types.Complex) for arg in [start, stop]): + dtype = types.complex128 + else: + dtype = types.float64 + + # Implementation based on https://github.com/numpy/numpy/blob/v1.20.0/numpy/core/function_base.py#L24 # noqa: E501 + def linspace(start, stop, num=50): + arr = np.empty(num, dtype) + # The multiply by 1.0 mirrors + # https://github.com/numpy/numpy/blob/v1.20.0/numpy/core/function_base.py#L125-L128 # noqa: E501 + # the side effect of this is important... start and stop become the same + # type as `dtype` i.e. 64/128 bits wide (float/complex). This is + # important later when used in the `np.divide`. + start = start * 1.0 + stop = stop * 1.0 + if num == 0: + return arr + div = num - 1 + if div > 0: + delta = stop - start + step = np.divide(delta, div) + for i in range(0, num): + arr[i] = start + (i * step) + else: + arr[0] = start + if num > 1: + arr[-1] = stop + return arr + + return linspace + + +def _array_copy(context, builder, sig, args): + """ + Array copy. 
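+
+ Uses a raw memcpy when the source already has the destination layout,
+ and an element-by-element copy over a generated loop nest otherwise.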
+ """ + arytype = sig.args[0] + ary = make_array(arytype)(context, builder, value=args[0]) + shapes = cgutils.unpack_tuple(builder, ary.shape) + + rettype = sig.return_type + ret = _empty_nd_impl(context, builder, rettype, shapes) + + src_data = ary.data + dest_data = ret.data + + assert rettype.layout in "CF" + if arytype.layout == rettype.layout: + # Fast path: memcpy + cgutils.raw_memcpy( + builder, dest_data, src_data, ary.nitems, ary.itemsize, align=1 + ) + + else: + src_strides = cgutils.unpack_tuple(builder, ary.strides) + dest_strides = cgutils.unpack_tuple(builder, ret.strides) + intp_t = context.get_value_type(types.intp) + + with cgutils.loop_nest(builder, shapes, intp_t) as indices: + src_ptr = cgutils.get_item_pointer2( + context, + builder, + src_data, + shapes, + src_strides, + arytype.layout, + indices, + ) + dest_ptr = cgutils.get_item_pointer2( + context, + builder, + dest_data, + shapes, + dest_strides, + rettype.layout, + indices, + ) + builder.store(builder.load(src_ptr), dest_ptr) + + return impl_ret_new_ref(context, builder, sig.return_type, ret._getvalue()) + + +@intrinsic +def _array_copy_intrinsic(typingctx, a): + assert isinstance(a, types.Array) + layout = "F" if a.layout == "F" else "C" + ret = a.copy(layout=layout, readonly=False) + sig = ret(a) + return sig, _array_copy + + +@lower_builtin("array.copy", types.Array) +def array_copy(context, builder, sig, args): + return _array_copy(context, builder, sig, args) + + +@overload(np.copy) +def impl_numpy_copy(a): + if isinstance(a, types.Array): + + def numpy_copy(a): + return _array_copy_intrinsic(a) + + return numpy_copy + + +def _as_layout_array(context, builder, sig, args, output_layout): + """ + Common logic for layout conversion function; + e.g. ascontiguousarray and asfortranarray + """ + retty = sig.return_type + aryty = sig.args[0] + assert retty.layout == output_layout, "return-type has incorrect layout" + + if aryty.ndim == 0: + # 0-dim input => asfortranarray() returns a 1-dim array + assert retty.ndim == 1 + ary = make_array(aryty)(context, builder, value=args[0]) + ret = make_array(retty)(context, builder) + + shape = context.get_constant_generic( + builder, + types.UniTuple(types.intp, 1), + (1,), + ) + strides = context.make_tuple( + builder, types.UniTuple(types.intp, 1), (ary.itemsize,) + ) + populate_array( + ret, ary.data, shape, strides, ary.itemsize, ary.meminfo, ary.parent + ) + return impl_ret_borrowed(context, builder, retty, ret._getvalue()) + + elif retty.layout == aryty.layout or ( + aryty.ndim == 1 and aryty.layout in "CF" + ): + # 1-dim contiguous input => return the same array + return impl_ret_borrowed(context, builder, retty, args[0]) + + else: + if aryty.layout == "A": + # There's still chance the array is in contiguous layout, + # just that we don't know at compile time. + # We can do a runtime check. 
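+ # Conceptually (an illustrative Python-level sketch, not the emitted IR):
+ #     if is_contiguous(a):   # or is_fortran(a) for an 'F' output layout
+ #         return a           # borrow the input unchanged
+ #     else:
+ #         return a.copy()    # copy into the requested layout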
+ + # Prepare and call is_contiguous or is_fortran + assert output_layout in "CF" + check_func = is_contiguous if output_layout == "C" else is_fortran + is_contig = _call_contiguous_check( + check_func, context, builder, aryty, args[0] + ) + with builder.if_else(is_contig) as (then, orelse): + # If the array is already contiguous, just return it + with then: + out_then = impl_ret_borrowed( + context, builder, retty, args[0] + ) + then_blk = builder.block + # Otherwise, copy to a new contiguous region + with orelse: + out_orelse = _array_copy(context, builder, sig, args) + orelse_blk = builder.block + # Phi node for the return value + ret_phi = builder.phi(out_then.type) + ret_phi.add_incoming(out_then, then_blk) + ret_phi.add_incoming(out_orelse, orelse_blk) + return ret_phi + + else: + # Return a copy with the right layout + return _array_copy(context, builder, sig, args) + + +@intrinsic +def _as_layout_array_intrinsic(typingctx, a, output_layout): + if not isinstance(output_layout, types.StringLiteral): + raise errors.RequireLiteralValue(output_layout) + + ret = a.copy(layout=output_layout.literal_value, ndim=max(a.ndim, 1)) + sig = ret(a, output_layout) + + return sig, lambda c, b, s, a: _as_layout_array( + c, b, s, a, output_layout=output_layout.literal_value + ) + + +@overload(np.ascontiguousarray) +def array_ascontiguousarray(a): + if not type_can_asarray(a): + raise errors.TypingError('The argument "a" must be array-like') + + if isinstance( + a, + ( + types.Number, + types.Boolean, + ), + ): + + def impl(a): + return np.ascontiguousarray(np.array(a)) + elif isinstance(a, types.Array): + + def impl(a): + return _as_layout_array_intrinsic(a, "C") + + return impl + + +@overload(np.asfortranarray) +def array_asfortranarray(a): + if not type_can_asarray(a): + raise errors.TypingError('The argument "a" must be array-like') + + if isinstance( + a, + ( + types.Number, + types.Boolean, + ), + ): + + def impl(a): + return np.asfortranarray(np.array(a)) + + return impl + elif isinstance(a, types.Array): + + def impl(a): + return _as_layout_array_intrinsic(a, "F") + + return impl + + +@lower_builtin("array.astype", types.Array, types.DTypeSpec) +@lower_builtin("array.astype", types.Array, types.StringLiteral) +def array_astype(context, builder, sig, args): + arytype = sig.args[0] + ary = make_array(arytype)(context, builder, value=args[0]) + shapes = cgutils.unpack_tuple(builder, ary.shape) + + rettype = sig.return_type + ret = _empty_nd_impl(context, builder, rettype, shapes) + + src_data = ary.data + dest_data = ret.data + + src_strides = cgutils.unpack_tuple(builder, ary.strides) + dest_strides = cgutils.unpack_tuple(builder, ret.strides) + intp_t = context.get_value_type(types.intp) + + with cgutils.loop_nest(builder, shapes, intp_t) as indices: + src_ptr = cgutils.get_item_pointer2( + context, + builder, + src_data, + shapes, + src_strides, + arytype.layout, + indices, + ) + dest_ptr = cgutils.get_item_pointer2( + context, + builder, + dest_data, + shapes, + dest_strides, + rettype.layout, + indices, + ) + item = load_item(context, builder, arytype, src_ptr) + item = context.cast(builder, item, arytype.dtype, rettype.dtype) + store_item(context, builder, rettype, item, dest_ptr) + + return impl_ret_new_ref(context, builder, sig.return_type, ret._getvalue()) + + +@intrinsic +def np_frombuffer(typingctx, buffer, dtype, retty): + ty = retty.instance_type + sig = ty(buffer, dtype, retty) + + def codegen(context, builder, sig, args): + bufty = sig.args[0] + aryty = sig.return_type + + buf = 
make_array(bufty)(context, builder, value=args[0]) + out_ary_ty = make_array(aryty) + out_ary = out_ary_ty(context, builder) + out_datamodel = out_ary._datamodel + + itemsize = get_itemsize(context, aryty) + ll_itemsize = Constant(buf.itemsize.type, itemsize) + nbytes = builder.mul(buf.nitems, buf.itemsize) + + # Check that the buffer size is compatible + rem = builder.srem(nbytes, ll_itemsize) + is_incompatible = cgutils.is_not_null(builder, rem) + with builder.if_then(is_incompatible, likely=False): + msg = "buffer size must be a multiple of element size" + context.call_conv.return_user_exc(builder, ValueError, (msg,)) + + shape = cgutils.pack_array(builder, [builder.sdiv(nbytes, ll_itemsize)]) + strides = cgutils.pack_array(builder, [ll_itemsize]) + data = builder.bitcast( + buf.data, context.get_value_type(out_datamodel.get_type("data")) + ) + + populate_array( + out_ary, + data=data, + shape=shape, + strides=strides, + itemsize=ll_itemsize, + meminfo=buf.meminfo, + parent=buf.parent, + ) + + res = out_ary._getvalue() + return impl_ret_borrowed(context, builder, sig.return_type, res) + + return sig, codegen + + +@overload(np.frombuffer) +def impl_np_frombuffer(buffer, dtype=float): + _check_const_str_dtype("frombuffer", dtype) + + if not isinstance(buffer, types.Buffer) or buffer.layout != "C": + msg = f'Argument "buffer" must be buffer-like. Got {buffer}' + raise errors.TypingError(msg) + + if ( + dtype is float + or (isinstance(dtype, types.Function) and dtype.typing_key is float) + or is_nonelike(dtype) + ): # default + nb_dtype = types.double + else: + nb_dtype = ty_parse_dtype(dtype) + + if nb_dtype is not None: + retty = types.Array( + dtype=nb_dtype, ndim=1, layout="C", readonly=not buffer.mutable + ) + else: + msg = ( + "Cannot parse input types to function " + f"np.frombuffer({buffer}, {dtype})" + ) + raise errors.TypingError(msg) + + def impl(buffer, dtype=float): + return np_frombuffer(buffer, dtype, retty) + + return impl + + +@overload(carray) +def impl_carray(ptr, shape, dtype=None): + if is_nonelike(dtype): + intrinsic_cfarray = get_cfarray_intrinsic("C", None) + + def impl(ptr, shape, dtype=None): + return intrinsic_cfarray(ptr, shape) + + return impl + elif isinstance(dtype, types.DTypeSpec): + intrinsic_cfarray = get_cfarray_intrinsic("C", dtype) + + def impl(ptr, shape, dtype=None): + return intrinsic_cfarray(ptr, shape) + + return impl + + +@overload(farray) +def impl_farray(ptr, shape, dtype=None): + if is_nonelike(dtype): + intrinsic_cfarray = get_cfarray_intrinsic("F", None) + + def impl(ptr, shape, dtype=None): + return intrinsic_cfarray(ptr, shape) + + return impl + elif isinstance(dtype, types.DTypeSpec): + intrinsic_cfarray = get_cfarray_intrinsic("F", dtype) + + def impl(ptr, shape, dtype=None): + return intrinsic_cfarray(ptr, shape) + + return impl + + +def get_cfarray_intrinsic(layout, dtype_): + @intrinsic + def intrinsic_cfarray(typingctx, ptr, shape): + if ptr is types.voidptr: + ptr_dtype = None + elif isinstance(ptr, types.CPointer): + ptr_dtype = ptr.dtype + else: + msg = f"pointer argument expected, got '{ptr}'" + raise errors.NumbaTypeError(msg) + + if dtype_ is None: + if ptr_dtype is None: + msg = "explicit dtype required for void* argument" + raise errors.NumbaTypeError(msg) + dtype = ptr_dtype + elif isinstance(dtype_, types.DTypeSpec): + dtype = dtype_.dtype + if ptr_dtype is not None and dtype != ptr_dtype: + msg = f"mismatching dtype '{dtype}' for pointer type '{ptr}'" + raise errors.NumbaTypeError(msg) + else: + msg = f"invalid dtype spec 
'{dtype_}'" + raise errors.NumbaTypeError(msg) + + ndim = ty_parse_shape(shape) + if ndim is None: + msg = f"invalid shape '{shape}'" + raise errors.NumbaTypeError(msg) + + retty = types.Array(dtype, ndim, layout) + sig = signature(retty, ptr, shape) + return sig, np_cfarray + + return intrinsic_cfarray + + +def np_cfarray(context, builder, sig, args): + """ + numba.numpy_support.carray(...) and + numba.numpy_support.farray(...). + """ + ptrty, shapety = sig.args[:2] + ptr, shape = args[:2] + + aryty = sig.return_type + assert aryty.layout in "CF" + + out_ary = make_array(aryty)(context, builder) + + itemsize = get_itemsize(context, aryty) + ll_itemsize = cgutils.intp_t(itemsize) + + if isinstance(shapety, types.BaseTuple): + shapes = cgutils.unpack_tuple(builder, shape) + else: + shapety = (shapety,) + shapes = (shape,) + shapes = [ + context.cast(builder, value, fromty, types.intp) + for fromty, value in zip(shapety, shapes) + ] + + off = ll_itemsize + strides = [] + if aryty.layout == "F": + for s in shapes: + strides.append(off) + off = builder.mul(off, s) + else: + for s in reversed(shapes): + strides.append(off) + off = builder.mul(off, s) + strides.reverse() + + data = builder.bitcast(ptr, context.get_data_type(aryty.dtype).as_pointer()) + + populate_array( + out_ary, + data=data, + shape=shapes, + strides=strides, + itemsize=ll_itemsize, + # Array is not memory-managed + meminfo=None, + ) + + res = out_ary._getvalue() + return impl_ret_new_ref(context, builder, sig.return_type, res) + + +def _get_seq_size(context, builder, seqty, seq): + if isinstance(seqty, types.BaseTuple): + return context.get_constant(types.intp, len(seqty)) + elif isinstance(seqty, types.Sequence): + len_impl = context.get_function( + len, + signature( + types.intp, + seqty, + ), + ) + return len_impl(builder, (seq,)) + else: + assert 0 + + +def _get_borrowing_getitem(context, seqty): + """ + Return a getitem() implementation that doesn't incref its result. + """ + retty = seqty.dtype + getitem_impl = context.get_function( + operator.getitem, signature(retty, seqty, types.intp) + ) + + def wrap(builder, args): + ret = getitem_impl(builder, args) + if context.enable_nrt: + context.nrt.decref(builder, retty, ret) + return ret + + return wrap + + +def compute_sequence_shape(context, builder, ndim, seqty, seq): + """ + Compute the likely shape of a nested sequence (possibly 0d). + """ + intp_t = context.get_value_type(types.intp) + zero = Constant(intp_t, 0) + + def get_first_item(seqty, seq): + if isinstance(seqty, types.BaseTuple): + if len(seqty) == 0: + return None, None + else: + return seqty[0], builder.extract_value(seq, 0) + else: + getitem_impl = _get_borrowing_getitem(context, seqty) + return seqty.dtype, getitem_impl(builder, (seq, zero)) + + # Compute shape by traversing the first element of each nested + # sequence + shapes = [] + innerty, inner = seqty, seq + + for i in range(ndim): + if i > 0: + innerty, inner = get_first_item(innerty, inner) + shapes.append(_get_seq_size(context, builder, innerty, inner)) + + return tuple(shapes) + + +def check_sequence_shape(context, builder, seqty, seq, shapes): + """ + Check the nested sequence matches the given *shapes*. 
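+
+ Raises ValueError("incompatible sequence shape") as soon as the length
+ of any nesting level disagrees with the corresponding entry of *shapes*.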
+ """ + + def _fail(): + context.call_conv.return_user_exc( + builder, ValueError, ("incompatible sequence shape",) + ) + + def check_seq_size(seqty, seq, shapes): + if len(shapes) == 0: + return + + size = _get_seq_size(context, builder, seqty, seq) + expected = shapes[0] + mismatch = builder.icmp_signed("!=", size, expected) + with builder.if_then(mismatch, likely=False): + _fail() + + if len(shapes) == 1: + return + + if isinstance(seqty, types.Sequence): + getitem_impl = _get_borrowing_getitem(context, seqty) + with cgutils.for_range(builder, size) as loop: + innerty = seqty.dtype + inner = getitem_impl(builder, (seq, loop.index)) + check_seq_size(innerty, inner, shapes[1:]) + + elif isinstance(seqty, types.BaseTuple): + for i in range(len(seqty)): + innerty = seqty[i] + inner = builder.extract_value(seq, i) + check_seq_size(innerty, inner, shapes[1:]) + + else: + assert 0, seqty + + check_seq_size(seqty, seq, shapes) + + +def assign_sequence_to_array( + context, builder, data, shapes, strides, arrty, seqty, seq +): + """ + Assign a nested sequence contents to an array. The shape must match + the sequence's structure. + """ + + def assign_item(indices, valty, val): + ptr = cgutils.get_item_pointer2( + context, + builder, + data, + shapes, + strides, + arrty.layout, + indices, + wraparound=False, + ) + val = context.cast(builder, val, valty, arrty.dtype) + store_item(context, builder, arrty, val, ptr) + + def assign(seqty, seq, shapes, indices): + if len(shapes) == 0: + assert not isinstance(seqty, (types.Sequence, types.BaseTuple)) + assign_item(indices, seqty, seq) + return + + size = shapes[0] + + if isinstance(seqty, types.Sequence): + getitem_impl = _get_borrowing_getitem(context, seqty) + with cgutils.for_range(builder, size) as loop: + innerty = seqty.dtype + inner = getitem_impl(builder, (seq, loop.index)) + assign(innerty, inner, shapes[1:], indices + (loop.index,)) + + elif isinstance(seqty, types.BaseTuple): + for i in range(len(seqty)): + innerty = seqty[i] + inner = builder.extract_value(seq, i) + index = context.get_constant(types.intp, i) + assign(innerty, inner, shapes[1:], indices + (index,)) + + else: + assert 0, seqty + + assign(seqty, seq, shapes, ()) + + +def np_array_typer(typingctx, object, dtype): + ndim, seq_dtype = _parse_nested_sequence(typingctx, object) + if is_nonelike(dtype): + dtype = seq_dtype + else: + dtype = ty_parse_dtype(dtype) + if dtype is None: + return + return types.Array(dtype, ndim, "C") + + +@intrinsic +def np_array(typingctx, obj, dtype): + _check_const_str_dtype("array", dtype) + ret = np_array_typer(typingctx, obj, dtype) + sig = ret(obj, dtype) + + def codegen(context, builder, sig, args): + arrty = sig.return_type + ndim = arrty.ndim + seqty = sig.args[0] + seq = args[0] + + shapes = compute_sequence_shape(context, builder, ndim, seqty, seq) + assert len(shapes) == ndim + + check_sequence_shape(context, builder, seqty, seq, shapes) + arr = _empty_nd_impl(context, builder, arrty, shapes) + assign_sequence_to_array( + context, builder, arr.data, shapes, arr.strides, arrty, seqty, seq + ) + + return impl_ret_new_ref( + context, builder, sig.return_type, arr._getvalue() + ) + + return sig, codegen + + +@overload(np.array) +def impl_np_array(object, dtype=None): + _check_const_str_dtype("array", dtype) + if not type_can_asarray(object): + raise errors.TypingError('The argument "object" must be array-like') + if not is_nonelike(dtype) and ty_parse_dtype(dtype) is None: + msg = 'The argument "dtype" must be a data-type if it is provided' + 
raise errors.TypingError(msg) + + def impl(object, dtype=None): + return np_array(object, dtype) + + return impl + + +def _normalize_axis(context, builder, func_name, ndim, axis): + zero = axis.type(0) + ll_ndim = axis.type(ndim) + + # Normalize negative axis + is_neg_axis = builder.icmp_signed("<", axis, zero) + axis = builder.select(is_neg_axis, builder.add(axis, ll_ndim), axis) + + # Check axis for bounds + axis_out_of_bounds = builder.or_( + builder.icmp_signed("<", axis, zero), + builder.icmp_signed(">=", axis, ll_ndim), + ) + with builder.if_then(axis_out_of_bounds, likely=False): + msg = "%s(): axis out of bounds" % func_name + context.call_conv.return_user_exc(builder, IndexError, (msg,)) + + return axis + + +def _insert_axis_in_shape(context, builder, orig_shape, ndim, axis): + """ + Compute shape with the new axis inserted + e.g. given original shape (2, 3, 4) and axis=2, + the returned new shape is (2, 3, 1, 4). + """ + assert len(orig_shape) == ndim - 1 + + ll_shty = ir.ArrayType(cgutils.intp_t, ndim) + shapes = cgutils.alloca_once(builder, ll_shty) + + one = cgutils.intp_t(1) + + # 1. copy original sizes at appropriate places + for dim in range(ndim - 1): + ll_dim = cgutils.intp_t(dim) + after_axis = builder.icmp_signed(">=", ll_dim, axis) + sh = orig_shape[dim] + idx = builder.select(after_axis, builder.add(ll_dim, one), ll_dim) + builder.store(sh, cgutils.gep_inbounds(builder, shapes, 0, idx)) + + # 2. insert new size (1) at axis dimension + builder.store(one, cgutils.gep_inbounds(builder, shapes, 0, axis)) + + return cgutils.unpack_tuple(builder, builder.load(shapes)) + + +def _insert_axis_in_strides(context, builder, orig_strides, ndim, axis): + """ + Same as _insert_axis_in_shape(), but with a strides array. + """ + assert len(orig_strides) == ndim - 1 + + ll_shty = ir.ArrayType(cgutils.intp_t, ndim) + strides = cgutils.alloca_once(builder, ll_shty) + + one = cgutils.intp_t(1) + zero = cgutils.intp_t(0) + + # 1. copy original strides at appropriate places + for dim in range(ndim - 1): + ll_dim = cgutils.intp_t(dim) + after_axis = builder.icmp_signed(">=", ll_dim, axis) + idx = builder.select(after_axis, builder.add(ll_dim, one), ll_dim) + builder.store( + orig_strides[dim], cgutils.gep_inbounds(builder, strides, 0, idx) + ) + + # 2. insert new stride at axis dimension + # (the value is indifferent for a 1-sized dimension, we use 0) + builder.store(zero, cgutils.gep_inbounds(builder, strides, 0, axis)) + + return cgutils.unpack_tuple(builder, builder.load(strides)) + + +def expand_dims(context, builder, sig, args, axis): + """ + np.expand_dims() with the given axis. 
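+    For example, inserting axis=1 into a (2, 4) array yields a (2, 1, 4) view
+    over the same data; no copy is made and the inserted dimension gets
+    stride 0.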
+ """ + retty = sig.return_type + ndim = retty.ndim + arrty = sig.args[0] + + arr = make_array(arrty)(context, builder, value=args[0]) + ret = make_array(retty)(context, builder) + + shapes = cgutils.unpack_tuple(builder, arr.shape) + strides = cgutils.unpack_tuple(builder, arr.strides) + + new_shapes = _insert_axis_in_shape(context, builder, shapes, ndim, axis) + new_strides = _insert_axis_in_strides(context, builder, strides, ndim, axis) + + populate_array( + ret, + data=arr.data, + shape=new_shapes, + strides=new_strides, + itemsize=arr.itemsize, + meminfo=arr.meminfo, + parent=arr.parent, + ) + + return ret._getvalue() + + +@intrinsic +def np_expand_dims(typingctx, a, axis): + layout = a.layout if a.ndim <= 1 else "A" + ret = a.copy(ndim=a.ndim + 1, layout=layout) + sig = ret(a, axis) + + def codegen(context, builder, sig, args): + axis = context.cast(builder, args[1], sig.args[1], types.intp) + axis = _normalize_axis( + context, builder, "np.expand_dims", sig.return_type.ndim, axis + ) + + ret = expand_dims(context, builder, sig, args, axis) + return impl_ret_borrowed(context, builder, sig.return_type, ret) + + return sig, codegen + + +@overload(np.expand_dims) +def impl_np_expand_dims(a, axis): + if not isinstance(a, types.Array): + msg = f'First argument "a" must be an array. Got {a}' + raise errors.TypingError(msg) + + if not isinstance(axis, types.Integer): + msg = f'Argument "axis" must be an integer. Got {axis}' + raise errors.TypingError(msg) + + def impl(a, axis): + return np_expand_dims(a, axis) + + return impl + + +def _atleast_nd(minimum, axes): + @intrinsic + def impl(typingcontext, *args): + arrtys = args + rettys = [arg.copy(ndim=max(arg.ndim, minimum)) for arg in args] + + def codegen(context, builder, sig, args): + transform = _atleast_nd_transform(minimum, axes) + arrs = cgutils.unpack_tuple(builder, args[0]) + + rets = [ + transform(context, builder, arr, arrty, retty) + for arr, arrty, retty in zip(arrs, arrtys, rettys) + ] + + if len(rets) > 1: + ret = context.make_tuple(builder, sig.return_type, rets) + else: + ret = rets[0] + return impl_ret_borrowed(context, builder, sig.return_type, ret) + + return signature( + types.Tuple(rettys) if len(rettys) > 1 else rettys[0], + types.StarArgTuple.from_types(args), + ), codegen + + return lambda *args: impl(*args) + + +def _atleast_nd_transform(min_ndim, axes): + """ + Return a callback successively inserting 1-sized dimensions at the + following axes. + """ + assert min_ndim == len(axes) + + def transform(context, builder, arr, arrty, retty): + for i in range(min_ndim): + ndim = i + 1 + if arrty.ndim < ndim: + axis = cgutils.intp_t(axes[i]) + newarrty = arrty.copy(ndim=arrty.ndim + 1) + arr = expand_dims( + context, + builder, + typing.signature(newarrty, arrty), + (arr,), + axis, + ) + arrty = newarrty + + return arr + + return transform + + +@overload(np.atleast_1d) +def np_atleast_1d(*args): + if all(isinstance(arg, types.Array) for arg in args): + return _atleast_nd(1, [0]) + + +@overload(np.atleast_2d) +def np_atleast_2d(*args): + if all(isinstance(arg, types.Array) for arg in args): + return _atleast_nd(2, [0, 0]) + + +@overload(np.atleast_3d) +def np_atleast_3d(*args): + if all(isinstance(arg, types.Array) for arg in args): + return _atleast_nd(3, [0, 0, 2]) + + +def _do_concatenate( + context, + builder, + axis, + arrtys, + arrs, + arr_shapes, + arr_strides, + retty, + ret_shapes, +): + """ + Concatenate arrays along the given axis. 
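+    The result array is allocated here; each input is copied into it in turn,
+    with elements cast to the result dtype, and the destination pointer is
+    then bumped by ret_strides[axis] * input_shape[axis].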
+ """ + assert len(arrtys) == len(arrs) == len(arr_shapes) == len(arr_strides) + + zero = cgutils.intp_t(0) + + # Allocate return array + ret = _empty_nd_impl(context, builder, retty, ret_shapes) + ret_strides = cgutils.unpack_tuple(builder, ret.strides) + + # Compute the offset by which to bump the destination pointer + # after copying each input array. + # Morally, we need to copy each input array at different start indices + # into the destination array; bumping the destination pointer + # is simply easier than offsetting all destination indices. + copy_offsets = [] + + for arr_sh in arr_shapes: + # offset = ret_strides[axis] * input_shape[axis] + offset = zero + for dim, (size, stride) in enumerate(zip(arr_sh, ret_strides)): + is_axis = builder.icmp_signed("==", axis.type(dim), axis) + addend = builder.mul(size, stride) + offset = builder.select( + is_axis, builder.add(offset, addend), offset + ) + copy_offsets.append(offset) + + # Copy input arrays into the return array + ret_data = ret.data + + for arrty, arr, arr_sh, arr_st, offset in zip( + arrtys, arrs, arr_shapes, arr_strides, copy_offsets + ): + arr_data = arr.data + + # Do the copy loop + # Note the loop nesting is optimized for the destination layout + loop_nest = cgutils.loop_nest( + builder, arr_sh, cgutils.intp_t, order=retty.layout + ) + + with loop_nest as indices: + src_ptr = cgutils.get_item_pointer2( + context, + builder, + arr_data, + arr_sh, + arr_st, + arrty.layout, + indices, + ) + val = load_item(context, builder, arrty, src_ptr) + val = context.cast(builder, val, arrty.dtype, retty.dtype) + dest_ptr = cgutils.get_item_pointer2( + context, + builder, + ret_data, + ret_shapes, + ret_strides, + retty.layout, + indices, + ) + store_item(context, builder, retty, val, dest_ptr) + + # Bump destination pointer + ret_data = cgutils.pointer_add(builder, ret_data, offset) + + return ret + + +def _np_concatenate(context, builder, arrtys, arrs, retty, axis): + ndim = retty.ndim + + arrs = [ + make_array(aty)(context, builder, value=a) + for aty, a in zip(arrtys, arrs) + ] + + axis = _normalize_axis(context, builder, "np.concatenate", ndim, axis) + + # Get input shapes + arr_shapes = [cgutils.unpack_tuple(builder, arr.shape) for arr in arrs] + arr_strides = [cgutils.unpack_tuple(builder, arr.strides) for arr in arrs] + + # Compute return shape: + # - the dimension for the concatenation axis is summed over all inputs + # - other dimensions must match exactly for each input + ret_shapes = [ + cgutils.alloca_once_value(builder, sh) for sh in arr_shapes[0] + ] + + for dim in range(ndim): + is_axis = builder.icmp_signed("==", axis.type(dim), axis) + ret_shape_ptr = ret_shapes[dim] + ret_sh = builder.load(ret_shape_ptr) + other_shapes = [sh[dim] for sh in arr_shapes[1:]] + + with builder.if_else(is_axis) as (on_axis, on_other_dim): + with on_axis: + sh = functools.reduce(builder.add, other_shapes + [ret_sh]) + builder.store(sh, ret_shape_ptr) + + with on_other_dim: + is_ok = cgutils.true_bit + for sh in other_shapes: + is_ok = builder.and_( + is_ok, builder.icmp_signed("==", sh, ret_sh) + ) + with builder.if_then(builder.not_(is_ok), likely=False): + context.call_conv.return_user_exc( + builder, + ValueError, + ( + "np.concatenate(): input sizes over " + "dimension %d do not match" % dim, + ), + ) + + ret_shapes = [builder.load(sh) for sh in ret_shapes] + + ret = _do_concatenate( + context, + builder, + axis, + arrtys, + arrs, + arr_shapes, + arr_strides, + retty, + ret_shapes, + ) + return impl_ret_new_ref(context, builder, retty, 
ret._getvalue()) + + +def _np_stack(context, builder, arrtys, arrs, retty, axis): + ndim = retty.ndim + + zero = cgutils.intp_t(0) + one = cgutils.intp_t(1) + ll_narrays = cgutils.intp_t(len(arrs)) + + arrs = [ + make_array(aty)(context, builder, value=a) + for aty, a in zip(arrtys, arrs) + ] + + axis = _normalize_axis(context, builder, "np.stack", ndim, axis) + + # Check input arrays have the same shape + orig_shape = cgutils.unpack_tuple(builder, arrs[0].shape) + + for arr in arrs[1:]: + is_ok = cgutils.true_bit + for sh, orig_sh in zip( + cgutils.unpack_tuple(builder, arr.shape), orig_shape + ): + is_ok = builder.and_(is_ok, builder.icmp_signed("==", sh, orig_sh)) + with builder.if_then(builder.not_(is_ok), likely=False): + context.call_conv.return_user_exc( + builder, + ValueError, + ("np.stack(): all input arrays must have the same shape",), + ) + + orig_strides = [cgutils.unpack_tuple(builder, arr.strides) for arr in arrs] + + # Compute input shapes and return shape with the new axis inserted + # e.g. given 5 input arrays of shape (2, 3, 4) and axis=1, + # corrected input shape is (2, 1, 3, 4) and return shape is (2, 5, 3, 4). + ll_shty = ir.ArrayType(cgutils.intp_t, ndim) + + input_shapes = cgutils.alloca_once(builder, ll_shty) + ret_shapes = cgutils.alloca_once(builder, ll_shty) + + # 1. copy original sizes at appropriate places + for dim in range(ndim - 1): + ll_dim = cgutils.intp_t(dim) + after_axis = builder.icmp_signed(">=", ll_dim, axis) + sh = orig_shape[dim] + idx = builder.select(after_axis, builder.add(ll_dim, one), ll_dim) + builder.store(sh, cgutils.gep_inbounds(builder, input_shapes, 0, idx)) + builder.store(sh, cgutils.gep_inbounds(builder, ret_shapes, 0, idx)) + + # 2. insert new size at axis dimension + builder.store(one, cgutils.gep_inbounds(builder, input_shapes, 0, axis)) + builder.store( + ll_narrays, cgutils.gep_inbounds(builder, ret_shapes, 0, axis) + ) + + input_shapes = cgutils.unpack_tuple(builder, builder.load(input_shapes)) + input_shapes = [input_shapes] * len(arrs) + ret_shapes = cgutils.unpack_tuple(builder, builder.load(ret_shapes)) + + # Compute input strides for each array with the new axis inserted + input_strides = [ + cgutils.alloca_once(builder, ll_shty) for i in range(len(arrs)) + ] + + # 1. copy original strides at appropriate places + for dim in range(ndim - 1): + ll_dim = cgutils.intp_t(dim) + after_axis = builder.icmp_signed(">=", ll_dim, axis) + idx = builder.select(after_axis, builder.add(ll_dim, one), ll_dim) + for i in range(len(arrs)): + builder.store( + orig_strides[i][dim], + cgutils.gep_inbounds(builder, input_strides[i], 0, idx), + ) + + # 2. 
insert new stride at axis dimension + # (the value is indifferent for a 1-sized dimension, we put 0) + for i in range(len(arrs)): + builder.store( + zero, cgutils.gep_inbounds(builder, input_strides[i], 0, axis) + ) + + input_strides = [ + cgutils.unpack_tuple(builder, builder.load(st)) for st in input_strides + ] + + # Create concatenated array + ret = _do_concatenate( + context, + builder, + axis, + arrtys, + arrs, + input_shapes, + input_strides, + retty, + ret_shapes, + ) + return impl_ret_new_ref(context, builder, retty, ret._getvalue()) + + +def np_concatenate_typer(typingctx, arrays, axis): + if axis is not None and not isinstance(axis, types.Integer): + # Note Numpy allows axis=None, but it isn't documented: + # https://github.com/numpy/numpy/issues/7968 + return + + # does type checking + dtype, ndim = _sequence_of_arrays(typingctx, "np.concatenate", arrays) + if ndim == 0: + msg = "zero-dimensional arrays cannot be concatenated" + raise errors.NumbaTypeError(msg) + + layout = _choose_concatenation_layout(arrays) + + return types.Array(dtype, ndim, layout) + + +@intrinsic +def np_concatenate(typingctx, arrays, axis): + ret = np_concatenate_typer(typingctx, arrays, axis) + assert isinstance(ret, types.Array) + sig = ret(arrays, axis) + + def codegen(context, builder, sig, args): + axis = context.cast(builder, args[1], sig.args[1], types.intp) + return _np_concatenate( + context, + builder, + list(sig.args[0]), + cgutils.unpack_tuple(builder, args[0]), + sig.return_type, + axis, + ) + + return sig, codegen + + +@overload(np.concatenate) +def impl_np_concatenate(arrays, axis=0): + if isinstance(arrays, types.BaseTuple): + + def impl(arrays, axis=0): + return np_concatenate(arrays, axis) + + return impl + + +def _column_stack_dims(context, func_name, arrays): + # column_stack() allows stacking 1-d and 2-d arrays together + for a in arrays: + if a.ndim < 1 or a.ndim > 2: + msg = "np.column_stack() is only defined on 1-d and 2-d arrays" + raise errors.NumbaTypeError(msg) + return 2 + + +@intrinsic +def np_column_stack(typingctx, tup): + dtype, ndim = _sequence_of_arrays( + typingctx, "np.column_stack", tup, dim_chooser=_column_stack_dims + ) + layout = _choose_concatenation_layout(tup) + ret = types.Array(dtype, ndim, layout) + sig = ret(tup) + + def codegen(context, builder, sig, args): + orig_arrtys = list(sig.args[0]) + orig_arrs = cgutils.unpack_tuple(builder, args[0]) + + arrtys = [] + arrs = [] + + axis = context.get_constant(types.intp, 1) + + for arrty, arr in zip(orig_arrtys, orig_arrs): + if arrty.ndim == 2: + arrtys.append(arrty) + arrs.append(arr) + else: + # Convert 1d array to 2d column array: np.expand_dims(a, 1) + assert arrty.ndim == 1 + newty = arrty.copy(ndim=2) + expand_sig = typing.signature(newty, arrty) + newarr = expand_dims(context, builder, expand_sig, (arr,), axis) + + arrtys.append(newty) + arrs.append(newarr) + + return _np_concatenate( + context, builder, arrtys, arrs, sig.return_type, axis + ) + + return sig, codegen + + +@overload(np.column_stack) +def impl_column_stack(tup): + if isinstance(tup, types.BaseTuple): + + def impl(tup): + return np_column_stack(tup) + + return impl + + +def _np_stack_common(context, builder, sig, args, axis): + """ + np.stack() with the given axis value. 
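+    For example, stacking five (2, 3, 4) arrays with axis=1 produces a
+    (2, 5, 3, 4) array (see _np_stack above).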
+ """ + return _np_stack( + context, + builder, + list(sig.args[0]), + cgutils.unpack_tuple(builder, args[0]), + sig.return_type, + axis, + ) + + +@intrinsic +def np_stack_common(typingctx, arrays, axis): + # does type checking + dtype, ndim = _sequence_of_arrays(typingctx, "np.stack", arrays) + layout = "F" if all(a.layout == "F" for a in arrays) else "C" + ret = types.Array(dtype, ndim + 1, layout) + sig = ret(arrays, axis) + + def codegen(context, builder, sig, args): + axis = context.cast(builder, args[1], sig.args[1], types.intp) + return _np_stack_common(context, builder, sig, args, axis) + + return sig, codegen + + +@overload(np.stack) +def impl_np_stack(arrays, axis=0): + if isinstance(arrays, types.BaseTuple): + + def impl(arrays, axis=0): + return np_stack_common(arrays, axis) + + return impl + + +def NdStack_typer(typingctx, func_name, arrays, ndim_min): + # does type checking + dtype, ndim = _sequence_of_arrays(typingctx, func_name, arrays) + ndim = max(ndim, ndim_min) + layout = _choose_concatenation_layout(arrays) + ret = types.Array(dtype, ndim, layout) + return ret + + +@intrinsic +def _np_hstack(typingctx, tup): + ret = NdStack_typer(typingctx, "np.hstack", tup, 1) + sig = ret(tup) + + def codegen(context, builder, sig, args): + tupty = sig.args[0] + ndim = tupty[0].ndim + + if ndim == 0: + # hstack() on 0-d arrays returns a 1-d array + axis = context.get_constant(types.intp, 0) + return _np_stack_common(context, builder, sig, args, axis) + + else: + # As a special case, dimension 0 of 1-dimensional arrays + # is "horizontal" + axis = 0 if ndim == 1 else 1 + + def np_hstack_impl(arrays): + return np.concatenate(arrays, axis=axis) + + return context.compile_internal(builder, np_hstack_impl, sig, args) + + return sig, codegen + + +@overload(np.hstack) +def impl_np_hstack(tup): + if isinstance(tup, types.BaseTuple): + + def impl(tup): + return _np_hstack(tup) + + return impl + + +@intrinsic +def _np_vstack(typingctx, tup): + ret = NdStack_typer(typingctx, "np.vstack", tup, 2) + sig = ret(tup) + + def codegen(context, builder, sig, args): + tupty = sig.args[0] + ndim = tupty[0].ndim + + if ndim == 0: + + def np_vstack_impl(arrays): + return np.expand_dims(np.hstack(arrays), 1) + + elif ndim == 1: + # np.stack(arrays, axis=0) + axis = context.get_constant(types.intp, 0) + return _np_stack_common(context, builder, sig, args, axis) + + else: + + def np_vstack_impl(arrays): + return np.concatenate(arrays, axis=0) + + return context.compile_internal(builder, np_vstack_impl, sig, args) + + return sig, codegen + + +@overload(np.vstack) +def impl_np_vstack(tup): + if isinstance(tup, types.BaseTuple): + + def impl(tup): + return _np_vstack(tup) + + return impl + + +if numpy_version >= (2, 0): + overload(np.row_stack)(impl_np_vstack) + + +@intrinsic +def _np_dstack(typingctx, tup): + ret = NdStack_typer(typingctx, "np.dstack", tup, 3) + sig = ret(tup) + + def codegen(context, builder, sig, args): + tupty = sig.args[0] + retty = sig.return_type + ndim = tupty[0].ndim + + if ndim == 0: + + def np_vstack_impl(arrays): + return np.hstack(arrays).reshape(1, 1, -1) + + return context.compile_internal(builder, np_vstack_impl, sig, args) + + elif ndim == 1: + # np.expand_dims(np.stack(arrays, axis=1), axis=0) + axis = context.get_constant(types.intp, 1) + stack_retty = retty.copy(ndim=retty.ndim - 1) + stack_sig = typing.signature(stack_retty, *sig.args) + stack_ret = _np_stack_common( + context, builder, stack_sig, args, axis + ) + + axis = context.get_constant(types.intp, 0) + expand_sig = 
typing.signature(retty, stack_retty) + return expand_dims(context, builder, expand_sig, (stack_ret,), axis) + + elif ndim == 2: + # np.stack(arrays, axis=2) + axis = context.get_constant(types.intp, 2) + return _np_stack_common(context, builder, sig, args, axis) + + else: + + def np_vstack_impl(arrays): + return np.concatenate(arrays, axis=2) + + return context.compile_internal(builder, np_vstack_impl, sig, args) + + return sig, codegen + + +@overload(np.dstack) +def impl_np_dstack(tup): + if isinstance(tup, types.BaseTuple): + + def impl(tup): + return _np_dstack(tup) + + return impl + + +@extending.overload_method(types.Array, "fill") +def arr_fill(arr, val): + def fill_impl(arr, val): + arr[:] = val + return None + + return fill_impl + + +@extending.overload_method(types.Array, "dot") +def array_dot(arr, other): + def dot_impl(arr, other): + return np.dot(arr, other) + + return dot_impl + + +@overload(np.fliplr) +def np_flip_lr(m): + if not type_can_asarray(m): + raise errors.TypingError("Cannot np.fliplr on %s type" % m) + + def impl(m): + A = np.asarray(m) + # this handling is superfluous/dead as < 2d array cannot be indexed as + # present below and so typing fails. If the typing doesn't fail due to + # some future change, this will catch it. + if A.ndim < 2: + raise ValueError("Input must be >= 2-d.") + return A[::, ::-1, ...] + + return impl + + +@overload(np.flipud) +def np_flip_ud(m): + if not type_can_asarray(m): + raise errors.TypingError("Cannot np.flipud on %s type" % m) + + def impl(m): + A = np.asarray(m) + # this handling is superfluous/dead as a 0d array cannot be indexed as + # present below and so typing fails. If the typing doesn't fail due to + # some future change, this will catch it. + if A.ndim < 1: + raise ValueError("Input must be >= 1-d.") + return A[::-1, ...] 
+
+    return impl
+
+
+@intrinsic
+def _build_flip_slice_tuple(tyctx, sz):
+    """Creates a tuple of slices for np.flip indexing like
+    `(slice(None, None, -1),) * sz`"""
+    if not isinstance(sz, types.IntegerLiteral):
+        raise errors.RequireLiteralValue(sz)
+    size = int(sz.literal_value)
+    tuple_type = types.UniTuple(dtype=types.slice3_type, count=size)
+    sig = tuple_type(sz)
+
+    def codegen(context, builder, signature, args):
+        def impl(length, empty_tuple):
+            out = empty_tuple
+            for i in range(length):
+                out = tuple_setitem(out, i, slice(None, None, -1))
+            return out
+
+        inner_argtypes = [types.intp, tuple_type]
+        inner_sig = typing.signature(tuple_type, *inner_argtypes)
+        ll_idx_type = context.get_value_type(types.intp)
+        # Allocate an empty tuple
+        empty_tuple = context.get_constant_undef(tuple_type)
+        inner_args = [ll_idx_type(size), empty_tuple]
+
+        res = context.compile_internal(builder, impl, inner_sig, inner_args)
+        return res
+
+    return sig, codegen
+
+
+@overload(np.flip)
+def np_flip(m):
+    # A constant value is needed for the tuple slice; types.Array.ndim can
+    # provide this, so at present only types.Array is supported.
+    if not isinstance(m, types.Array):
+        raise errors.TypingError("Cannot np.flip on %s type" % m)
+
+    def impl(m):
+        sl = _build_flip_slice_tuple(m.ndim)
+        return m[sl]
+
+    return impl
+
+
+@overload(np.array_split)
+def np_array_split(ary, indices_or_sections, axis=0):
+    if isinstance(ary, (types.UniTuple, types.ListType, types.List)):
+
+        def impl(ary, indices_or_sections, axis=0):
+            return np.array_split(
+                np.asarray(ary), indices_or_sections, axis=axis
+            )
+
+        return impl
+
+    if isinstance(indices_or_sections, types.Integer):
+
+        def impl(ary, indices_or_sections, axis=0):
+            l, rem = divmod(ary.shape[axis], indices_or_sections)
+            indices = np.cumsum(
+                np.array([l + 1] * rem + [l] * (indices_or_sections - rem - 1))
+            )
+            return np.array_split(ary, indices, axis=axis)
+
+        return impl
+
+    elif isinstance(indices_or_sections, types.IterableType) and isinstance(
+        indices_or_sections.iterator_type.yield_type, types.Integer
+    ):
+
+        def impl(ary, indices_or_sections, axis=0):
+            slice_tup = build_full_slice_tuple(ary.ndim)
+            axis = normalize_axis("np.split", "axis", ary.ndim, axis)
+            out = []
+            prev = 0
+            for cur in indices_or_sections:
+                idx = tuple_setitem(slice_tup, axis, slice(prev, cur))
+                out.append(ary[idx])
+                prev = cur
+            out.append(ary[tuple_setitem(slice_tup, axis, slice(cur, None))])
+            return out
+
+        return impl
+
+    elif isinstance(indices_or_sections, types.Tuple) and all(
+        isinstance(t, types.Integer) for t in indices_or_sections.types
+    ):
+
+        def impl(ary, indices_or_sections, axis=0):
+            slice_tup = build_full_slice_tuple(ary.ndim)
+            axis = normalize_axis("np.split", "axis", ary.ndim, axis)
+            out = []
+            prev = 0
+            for cur in literal_unroll(indices_or_sections):
+                idx = tuple_setitem(slice_tup, axis, slice(prev, cur))
+                out.append(ary[idx])
+                prev = cur
+            out.append(ary[tuple_setitem(slice_tup, axis, slice(cur, None))])
+            return out
+
+        return impl
+
+
+@overload(np.split)
+def np_split(ary, indices_or_sections, axis=0):
+    # This is just a wrapper of array_split, but with an extra error if
+    # indices is an int.
+ if isinstance(ary, (types.UniTuple, types.ListType, types.List)): + + def impl(ary, indices_or_sections, axis=0): + return np.split(np.asarray(ary), indices_or_sections, axis=axis) + + return impl + + if isinstance(indices_or_sections, types.Integer): + + def impl(ary, indices_or_sections, axis=0): + _, rem = divmod(ary.shape[axis], indices_or_sections) + if rem != 0: + raise ValueError( + "array split does not result in an equal division" + ) + return np.array_split(ary, indices_or_sections, axis=axis) + + return impl + + else: + return np_array_split(ary, indices_or_sections, axis=axis) + + +@overload(np.vsplit) +def numpy_vsplit(ary, indices_or_sections): + if not isinstance(ary, types.Array): + msg = 'The argument "ary" must be an array' + raise errors.TypingError(msg) + + if not isinstance( + indices_or_sections, + (types.Integer, types.Array, types.List, types.UniTuple), + ): + msg = 'The argument "indices_or_sections" must be int or 1d-array' + raise errors.TypingError(msg) + + def impl(ary, indices_or_sections): + if ary.ndim < 2: + raise ValueError( + ("vsplit only works on arrays of 2 or more dimensions") + ) + return np.split(ary, indices_or_sections, axis=0) + + return impl + + +@overload(np.hsplit) +def numpy_hsplit(ary, indices_or_sections): + if not isinstance(ary, types.Array): + msg = 'The argument "ary" must be an array' + raise errors.TypingError(msg) + + if not isinstance( + indices_or_sections, + (types.Integer, types.Array, types.List, types.UniTuple), + ): + msg = 'The argument "indices_or_sections" must be int or 1d-array' + raise errors.TypingError(msg) + + def impl(ary, indices_or_sections): + if ary.ndim == 0: + raise ValueError( + ("hsplit only works on arrays of 1 or more dimensions") + ) + if ary.ndim > 1: + return np.split(ary, indices_or_sections, axis=1) + return np.split(ary, indices_or_sections, axis=0) + + return impl + + +@overload(np.dsplit) +def numpy_dsplit(ary, indices_or_sections): + if not isinstance(ary, types.Array): + msg = 'The argument "ary" must be an array' + raise errors.TypingError(msg) + + if not isinstance( + indices_or_sections, + (types.Integer, types.Array, types.List, types.UniTuple), + ): + msg = 'The argument "indices_or_sections" must be int or 1d-array' + raise errors.TypingError(msg) + + def impl(ary, indices_or_sections): + if ary.ndim < 3: + raise ValueError( + "dsplit only works on arrays of 3 or more dimensions" + ) + return np.split(ary, indices_or_sections, axis=2) + + return impl + + +# ----------------------------------------------------------------------------- +# Sorting + +_sorts = {} + + +def default_lt(a, b): + """ + Trivial comparison function between two keys. + """ + return a < b + + +def get_sort_func(kind, lt_impl, is_argsort=False): + """ + Get a sort implementation of the given kind. 
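+    Compiled sorts are cached in _sorts, keyed on (kind, comparator name,
+    is_argsort), so each variant is only generated once.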
+ """ + key = kind, lt_impl.__name__, is_argsort + + try: + return _sorts[key] + except KeyError: + if kind == "quicksort": + sort = quicksort.make_jit_quicksort( + lt=lt_impl, is_argsort=is_argsort, is_np_array=True + ) + func = sort.run_quicksort + elif kind == "mergesort": + sort = mergesort.make_jit_mergesort( + lt=lt_impl, is_argsort=is_argsort + ) + func = sort.run_mergesort + _sorts[key] = func + return func + + +def lt_implementation(dtype): + if isinstance(dtype, types.Float): + return lt_floats + elif isinstance(dtype, types.Complex): + return lt_complex + else: + return default_lt + + +@lower_builtin("array.sort", types.Array) +def array_sort(context, builder, sig, args): + arytype = sig.args[0] + + sort_func = get_sort_func( + kind="quicksort", lt_impl=lt_implementation(arytype.dtype) + ) + + def array_sort_impl(arr): + # Note we clobber the return value + sort_func(arr) + + return context.compile_internal(builder, array_sort_impl, sig, args) + + +@overload(np.sort) +def impl_np_sort(a): + if not type_can_asarray(a): + raise errors.TypingError('Argument "a" must be array-like') + + def np_sort_impl(a): + res = a.copy() + res.sort() + return res + + return np_sort_impl + + +@lower_builtin("array.argsort", types.Array, types.StringLiteral) +@lower_builtin(np.argsort, types.Array, types.StringLiteral) +def array_argsort(context, builder, sig, args): + arytype, kind = sig.args + + sort_func = get_sort_func( + kind=kind.literal_value, + lt_impl=lt_implementation(arytype.dtype), + is_argsort=True, + ) + + def array_argsort_impl(arr): + return sort_func(arr) + + innersig = sig.replace(args=sig.args[:1]) + innerargs = args[:1] + return context.compile_internal( + builder, array_argsort_impl, innersig, innerargs + ) + + +# ------------------------------------------------------------------------------ +# Implicit cast + + +@lower_cast(types.Array, types.Array) +def array_to_array(context, builder, fromty, toty, val): + # Type inference should have prevented illegal array casting. + assert fromty.mutable != toty.mutable or toty.layout == "A" + return val + + +@lower_cast(types.Array, types.UnicodeCharSeq) +@lower_cast(types.Array, types.Float) +@lower_cast(types.Array, types.Integer) +@lower_cast(types.Array, types.Complex) +@lower_cast(types.Array, types.Boolean) +@lower_cast(types.Array, types.NPTimedelta) +@lower_cast(types.Array, types.NPDatetime) +def array0d_to_scalar(context, builder, fromty, toty, val): + def impl(a): + # a is an array(T, 0d, O), T is type, O is order + return a.take(0) + + sig = signature(toty, fromty) + res = context.compile_internal(builder, impl, sig, [val]) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +@lower_cast(types.Array, types.UnicodeCharSeq) +def array_to_unichrseq(context, builder, fromty, toty, val): + def impl(a): + return str(a[()]) + + sig = signature(toty, fromty) + res = context.compile_internal(builder, impl, sig, [val]) + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +# ------------------------------------------------------------------------------ +# Stride tricks + + +def reshape_unchecked(a, shape, strides): + """ + An intrinsic returning a derived array with the given shape and strides. 
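+    No consistency checking is performed on *shape* and *strides*; the caller
+    is responsible for passing a compatible pair (see as_strided() and
+    sliding_window_view() below).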
+ """ + raise NotImplementedError + + +@extending.type_callable(reshape_unchecked) +def type_reshape_unchecked(context): + def check_shape(shape): + return isinstance(shape, types.BaseTuple) and all( + isinstance(v, types.Integer) for v in shape + ) + + def typer(a, shape, strides): + if not isinstance(a, types.Array): + return + if not check_shape(shape) or not check_shape(strides): + return + if len(shape) != len(strides): + return + return a.copy(ndim=len(shape), layout="A") + + return typer + + +@lower_builtin(reshape_unchecked, types.Array, types.BaseTuple, types.BaseTuple) +def impl_shape_unchecked(context, builder, sig, args): + aryty = sig.args[0] + retty = sig.return_type + + ary = make_array(aryty)(context, builder, args[0]) + out = make_array(retty)(context, builder) + shape = cgutils.unpack_tuple(builder, args[1]) + strides = cgutils.unpack_tuple(builder, args[2]) + + populate_array( + out, + data=ary.data, + shape=shape, + strides=strides, + itemsize=ary.itemsize, + meminfo=ary.meminfo, + ) + + res = out._getvalue() + return impl_ret_borrowed(context, builder, retty, res) + + +@extending.overload(np.lib.stride_tricks.as_strided) +def as_strided(x, shape=None, strides=None): + if shape in (None, types.none): + + @register_jitable + def get_shape(x, shape): + return x.shape + else: + + @register_jitable + def get_shape(x, shape): + return shape + + if strides in (None, types.none): + # When *strides* is not passed, as_strided() does a non-size-checking + # reshape(), possibly changing the original strides. This is too + # cumbersome to support right now, and a Web search shows all example + # use cases of as_strided() pass explicit *strides*. + raise errors.TypingError("as_strided() strides argument cannot be None") + else: + + @register_jitable + def get_strides(x, strides): + return strides + + def as_strided_impl(x, shape=None, strides=None): + x = reshape_unchecked(x, get_shape(x, shape), get_strides(x, strides)) + return x + + return as_strided_impl + + +@extending.overload(np.lib.stride_tricks.sliding_window_view) +def sliding_window_view(x, window_shape, axis=None): + # Window shape must be given as either an integer or tuple of integers. + # We also need to generate buffer tuples we can modify to contain the + # final shape and strides (reshape_unchecked does not accept lists). + if isinstance(window_shape, types.Integer): + shape_buffer = tuple(range(x.ndim + 1)) + stride_buffer = tuple(range(x.ndim + 1)) + + @register_jitable + def get_window_shape(window_shape): + return (window_shape,) + + elif isinstance(window_shape, types.UniTuple) and isinstance( + window_shape.dtype, types.Integer + ): + shape_buffer = tuple(range(x.ndim + len(window_shape))) + stride_buffer = tuple(range(x.ndim + len(window_shape))) + + @register_jitable + def get_window_shape(window_shape): + return window_shape + + else: + raise errors.TypingError( + "window_shape must be an integer or tuple of integers" + ) + + # Axis must be integer, tuple of integers, or None for all axes. 
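+    # When axis is None every dimension is windowed in order, so window_shape
+    # must then supply one window length per dimension of x.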
+ if is_nonelike(axis): + + @register_jitable + def get_axis(window_shape, axis, ndim): + return list(range(ndim)) + + elif isinstance(axis, types.Integer): + + @register_jitable + def get_axis(window_shape, axis, ndim): + return [normalize_axis("sliding_window_view", "axis", ndim, axis)] + + elif isinstance(axis, types.UniTuple) and isinstance( + axis.dtype, types.Integer + ): + + @register_jitable + def get_axis(window_shape, axis, ndim): + return [ + normalize_axis("sliding_window_view", "axis", ndim, a) + for a in axis + ] + + else: + raise errors.TypingError( + "axis must be None, an integer or tuple of integers" + ) + + def sliding_window_view_impl(x, window_shape, axis=None): + window_shape = get_window_shape(window_shape) + axis = get_axis(window_shape, axis, x.ndim) + if len(window_shape) != len(axis): + raise ValueError( + "Must provide matching length window_shape and axis" + ) + + # Initialise view details with shape and strides of x. + out_shape = shape_buffer + out_strides = stride_buffer + for i in range(x.ndim): + out_shape = tuple_setitem(out_shape, i, x.shape[i]) + out_strides = tuple_setitem(out_strides, i, x.strides[i]) + + # Trim the dimensions being windowed and set the window shape and + # strides. Note: the same axis can be windowed repeatedly. + i = x.ndim + for ax, dim in zip(axis, window_shape): + if dim < 0: + raise ValueError( + "`window_shape` cannot contain negative values" + ) + if out_shape[ax] < dim: + raise ValueError( + "window_shape cannot be larger than input array shape" + ) + + trimmed = out_shape[ax] - dim + 1 + out_shape = tuple_setitem(out_shape, ax, trimmed) + out_shape = tuple_setitem(out_shape, i, dim) + out_strides = tuple_setitem(out_strides, i, x.strides[ax]) + i += 1 + + # The NumPy version calls as_strided, but our implementation of + # as_strided is effectively a wrapper for reshape_unchecked. + view = reshape_unchecked(x, out_shape, out_strides) + return view + + return sliding_window_view_impl + + +@overload(bool) +def ol_bool(arr): + if isinstance(arr, types.Array): + + def impl(arr): + if arr.size == 0: + if numpy_version < (2, 2): + return False # this is deprecated + else: + raise ValueError( + ( + "The truth value of an empty array is " + "ambiguous. Use `array.size > 0` to " + "check that an array is not empty." + ) + ) + elif arr.size == 1: + return bool(arr.take(0)) + else: + raise ValueError( + ( + "The truth value of an array with more than" + " one element is ambiguous. 
Use a.any() or" + " a.all()" + ) + ) + + return impl + + +@overload(np.swapaxes) +def numpy_swapaxes(a, axis1, axis2): + if not isinstance(axis1, (int, types.Integer)): + raise errors.TypingError( + 'The second argument "axis1" must be an integer' + ) + if not isinstance(axis2, (int, types.Integer)): + raise errors.TypingError( + 'The third argument "axis2" must be an integer' + ) + if not isinstance(a, types.Array): + raise errors.TypingError('The first argument "a" must be an array') + + # create tuple list for transpose + ndim = a.ndim + axes_list = tuple(range(ndim)) + + def impl(a, axis1, axis2): + axis1 = normalize_axis("np.swapaxes", "axis1", ndim, axis1) + axis2 = normalize_axis("np.swapaxes", "axis2", ndim, axis2) + + # to ensure tuple_setitem support of negative values + if axis1 < 0: + axis1 += ndim + if axis2 < 0: + axis2 += ndim + + axes_tuple = tuple_setitem(axes_list, axis1, axis2) + axes_tuple = tuple_setitem(axes_tuple, axis2, axis1) + return np.transpose(a, axes_tuple) + + return impl + + +@register_jitable +def _take_along_axis_impl( + arr, indices, axis, Ni_orig, Nk_orig, indices_broadcast_shape +): + # Based on example code in + # https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/lib/shape_base.py#L90-L103 + # With addition of pre-broadcasting: + # https://github.com/numpy/numpy/issues/19704 + + # Wrap axis, it's used in tuple_setitem so must be (axis >= 0) to ensure + # the GEP is in bounds. + axis = normalize_axis("np.take_along_axis", "axis", arr.ndim, axis) + + # Broadcast the two arrays to matching shapes: + arr_shape = list(arr.shape) + arr_shape[axis] = 1 + for i, (d1, d2) in enumerate(zip(arr_shape, indices.shape)): + if d1 == 1: + new_val = d2 + elif d2 == 1: + new_val = d1 + else: + if d1 != d2: + raise ValueError("`arr` and `indices` dimensions don't match") + new_val = d1 + indices_broadcast_shape = tuple_setitem( + indices_broadcast_shape, i, new_val + ) + arr_broadcast_shape = tuple_setitem( + indices_broadcast_shape, axis, arr.shape[axis] + ) + arr = np.broadcast_to(arr, arr_broadcast_shape) + indices = np.broadcast_to(indices, indices_broadcast_shape) + + Ni = Ni_orig + if len(Ni_orig) > 0: + for i in range(len(Ni)): + Ni = tuple_setitem(Ni, i, arr.shape[i]) + Nk = Nk_orig + if len(Nk_orig) > 0: + for i in range(len(Nk)): + Nk = tuple_setitem(Nk, i, arr.shape[axis + 1 + i]) + + J = indices.shape[axis] # Need not equal M + out = np.empty(Ni + (J,) + Nk, arr.dtype) + + np_s_ = (slice(None, None, None),) + + for ii in np.ndindex(Ni): + for kk in np.ndindex(Nk): + a_1d = arr[ii + np_s_ + kk] + indices_1d = indices[ii + np_s_ + kk] + out_1d = out[ii + np_s_ + kk] + for j in range(J): + out_1d[j] = a_1d[indices_1d[j]] + return out + + +@overload(np.take_along_axis) +def arr_take_along_axis(arr, indices, axis): + if not isinstance(arr, types.Array): + raise errors.TypingError('The first argument "arr" must be an array') + if not isinstance(indices, types.Array): + raise errors.TypingError( + 'The second argument "indices" must be an array' + ) + if not isinstance(indices.dtype, types.Integer): + raise errors.TypingError("The indices array must contain integers") + if is_nonelike(axis): + arr_ndim = 1 + else: + arr_ndim = arr.ndim + if arr_ndim != indices.ndim: + # Matches NumPy error: + raise errors.TypingError( + "`indices` and `arr` must have the same number of dimensions" + ) + + indices_broadcast_shape = tuple(range(indices.ndim)) + if is_nonelike(axis): + + def take_along_axis_impl(arr, indices, axis): + return 
_take_along_axis_impl( + arr.flatten(), indices, 0, (), (), indices_broadcast_shape + ) + else: + check_is_integer(axis, "axis") + if not isinstance(axis, types.IntegerLiteral): + raise errors.NumbaValueError("axis must be a literal value") + axis = axis.literal_value + if axis < 0: + axis = arr.ndim + axis + + if axis < 0 or axis >= arr.ndim: + raise errors.NumbaValueError("axis is out of bounds") + + Ni = tuple(range(axis)) + Nk = tuple(range(axis + 1, arr.ndim)) + + def take_along_axis_impl(arr, indices, axis): + return _take_along_axis_impl( + arr, indices, axis, Ni, Nk, indices_broadcast_shape + ) + + return take_along_axis_impl + + +@overload(np.nan_to_num) +def nan_to_num_impl(x, copy=True, nan=0.0): + if isinstance(x, types.Number): + if isinstance(x, types.Integer): + # Integers do not have nans or infs + def impl(x, copy=True, nan=0.0): + return x + + elif isinstance(x, types.Float): + + def impl(x, copy=True, nan=0.0): + if np.isnan(x): + return nan + elif np.isneginf(x): + return np.finfo(type(x)).min + elif np.isposinf(x): + return np.finfo(type(x)).max + return x + elif isinstance(x, types.Complex): + + def impl(x, copy=True, nan=0.0): + r = np.nan_to_num(x.real, nan=nan) + c = np.nan_to_num(x.imag, nan=nan) + return complex(r, c) + else: + raise errors.TypingError( + "Only Integer, Float, and Complex values are accepted" + ) + + elif type_can_asarray(x): + if isinstance(x.dtype, types.Integer): + # Integers do not have nans or infs + def impl(x, copy=True, nan=0.0): + return x + elif isinstance(x.dtype, types.Float): + + def impl(x, copy=True, nan=0.0): + min_inf = np.finfo(x.dtype).min + max_inf = np.finfo(x.dtype).max + + x_ = np.asarray(x) + output = np.copy(x_) if copy else x_ + + output_flat = output.flat + for i in range(output.size): + if np.isnan(output_flat[i]): + output_flat[i] = nan + elif np.isneginf(output_flat[i]): + output_flat[i] = min_inf + elif np.isposinf(output_flat[i]): + output_flat[i] = max_inf + return output + elif isinstance(x.dtype, types.Complex): + + def impl(x, copy=True, nan=0.0): + x_ = np.asarray(x) + output = np.copy(x_) if copy else x_ + + np.nan_to_num(output.real, copy=False, nan=nan) + np.nan_to_num(output.imag, copy=False, nan=nan) + return output + else: + raise errors.TypingError( + "Only Integer, Float, and Complex values are accepted" + ) + else: + raise errors.TypingError( + "The first argument must be a scalar or an array-like" + ) + return impl diff --git a/numba_cuda/numba/cuda/np/math/__init__.py b/numba_cuda/numba/cuda/np/math/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/numba_cuda/numba/cuda/np/math/cmathimpl.py b/numba_cuda/numba/cuda/np/math/cmathimpl.py new file mode 100644 index 000000000..86b4bb01e --- /dev/null +++ b/numba_cuda/numba/cuda/np/math/cmathimpl.py @@ -0,0 +1,558 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +""" +Implement the cmath module functions. 
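+
+The algorithms follow CPython's cmathmodule.c and NumPy's
+npy_math_complex.c.src, with explicit handling of the special values
+(NaNs and infinities).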
+""" + +import cmath +import math + +from numba.core.imputils import impl_ret_untracked +from numba.core import types +from numba.cuda.typing import signature +from numba.cuda.cpython import mathimpl + +# registry = Registry('cmathimpl') +# lower = registry.lower + + +def is_nan(builder, z): + return builder.fcmp_unordered("uno", z.real, z.imag) + + +def is_inf(builder, z): + return builder.or_( + mathimpl.is_inf(builder, z.real), mathimpl.is_inf(builder, z.imag) + ) + + +def is_finite(builder, z): + return builder.and_( + mathimpl.is_finite(builder, z.real), mathimpl.is_finite(builder, z.imag) + ) + + +# @lower(cmath.isnan, types.Complex) +def isnan_float_impl(context, builder, sig, args): + [typ] = sig.args + [value] = args + z = context.make_complex(builder, typ, value=value) + res = is_nan(builder, z) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(cmath.isinf, types.Complex) +def isinf_float_impl(context, builder, sig, args): + [typ] = sig.args + [value] = args + z = context.make_complex(builder, typ, value=value) + res = is_inf(builder, z) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(cmath.isfinite, types.Complex) +def isfinite_float_impl(context, builder, sig, args): + [typ] = sig.args + [value] = args + z = context.make_complex(builder, typ, value=value) + res = is_finite(builder, z) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @overload(cmath.rect) +def impl_cmath_rect(r, phi): + if all([isinstance(typ, types.Float) for typ in [r, phi]]): + + def impl(r, phi): + if not math.isfinite(phi): + if not r: + # cmath.rect(0, phi={inf, nan}) = 0 + return abs(r) + if math.isinf(r): + # cmath.rect(inf, phi={inf, nan}) = inf + j phi + return complex(r, phi) + real = math.cos(phi) + imag = math.sin(phi) + if real == 0.0 and math.isinf(r): + # 0 * inf would return NaN, we want to keep 0 but xor the sign + real /= r + else: + real *= r + if imag == 0.0 and math.isinf(r): + # ditto + imag /= r + else: + imag *= r + return complex(real, imag) + + return impl + + +def intrinsic_complex_unary(inner_func): + def wrapper(context, builder, sig, args): + [typ] = sig.args + [value] = args + z = context.make_complex(builder, typ, value=value) + x = z.real + y = z.imag + # Same as above: math.isfinite() is unavailable on 2.x so we precompute + # its value and pass it to the pure Python implementation. + x_is_finite = mathimpl.is_finite(builder, x) + y_is_finite = mathimpl.is_finite(builder, y) + inner_sig = signature( + sig.return_type, *(typ.underlying_float,) * 2 + (types.boolean,) * 2 + ) + res = context.compile_internal( + builder, inner_func, inner_sig, (x, y, x_is_finite, y_is_finite) + ) + return impl_ret_untracked(context, builder, sig, res) + + return wrapper + + +NAN = float("nan") +INF = float("inf") + + +# @lower(cmath.exp, types.Complex) +@intrinsic_complex_unary +def exp_impl(x, y, x_is_finite, y_is_finite): + """cmath.exp(x + y j)""" + if x_is_finite: + if y_is_finite: + c = math.cos(y) + s = math.sin(y) + r = math.exp(x) + return complex(r * c, r * s) + else: + return complex(NAN, NAN) + elif math.isnan(x): + if y: + return complex(x, x) # nan + j nan + else: + return complex(x, y) # nan + 0j + elif x > 0.0: + # x == +inf + if y_is_finite: + real = math.cos(y) + imag = math.sin(y) + # Avoid NaNs if math.cos(y) or math.sin(y) == 0 + # (e.g. 
cmath.exp(inf + 0j) == inf + 0j) + if real != 0: + real *= x + if imag != 0: + imag *= x + return complex(real, imag) + else: + return complex(x, NAN) + else: + # x == -inf + if y_is_finite: + r = math.exp(x) + c = math.cos(y) + s = math.sin(y) + return complex(r * c, r * s) + else: + r = 0 + return complex(r, r) + + +# @lower(cmath.log, types.Complex) +@intrinsic_complex_unary +def log_impl(x, y, x_is_finite, y_is_finite): + """cmath.log(x + y j)""" + a = math.log(math.hypot(x, y)) + b = math.atan2(y, x) + return complex(a, b) + + +# @lower(cmath.log, types.Complex, types.Complex) +def log_base_impl(context, builder, sig, args): + """cmath.log(z, base)""" + [z, base] = args + + def log_base(z, base): + return cmath.log(z) / cmath.log(base) + + res = context.compile_internal(builder, log_base, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @overload(cmath.log10) +def impl_cmath_log10(z): + if not isinstance(z, types.Complex): + return + + LN_10 = 2.302585092994045684 + + def log10_impl(z): + """cmath.log10(z)""" + z = cmath.log(z) + # This formula gives better results on +/-inf than cmath.log(z, 10) + # See http://bugs.python.org/issue22544 + return complex(z.real / LN_10, z.imag / LN_10) + + return log10_impl + + +# @overload(cmath.phase) +def phase_impl(x): + """cmath.phase(x + y j)""" + + if not isinstance(x, types.Complex): + return + + def impl(x): + return math.atan2(x.imag, x.real) + + return impl + + +# @overload(cmath.polar) +def polar_impl(x): + if not isinstance(x, types.Complex): + return + + def impl(x): + r, i = x.real, x.imag + return math.hypot(r, i), math.atan2(i, r) + + return impl + + +# @lower(cmath.sqrt, types.Complex) +def sqrt_impl(context, builder, sig, args): + # We risk spurious overflow for components >= FLT_MAX / (1 + sqrt(2)). + + SQRT2 = 1.414213562373095048801688724209698079e0 + ONE_PLUS_SQRT2 = 1.0 + SQRT2 + theargflt = sig.args[0].underlying_float + # Get a type specific maximum value so scaling for overflow is based on that + MAX = mathimpl.DBL_MAX if theargflt.bitwidth == 64 else mathimpl.FLT_MAX + # THRES will be double precision, should not impact typing as it's just + # used for comparison, there *may* be a few values near THRES which + # deviate from e.g. NumPy due to rounding that occurs in the computation + # of this value in the case of a 32bit argument. + THRES = MAX / ONE_PLUS_SQRT2 + + def sqrt_impl(z): + """cmath.sqrt(z)""" + # This is NumPy's algorithm, see npy_csqrt() in npy_math_complex.c.src + a = z.real + b = z.imag + if a == 0.0 and b == 0.0: + return complex(abs(b), b) + if math.isinf(b): + return complex(abs(b), b) + if math.isnan(a): + return complex(a, a) + if math.isinf(a): + if a < 0.0: + return complex(abs(b - b), math.copysign(a, b)) + else: + return complex(a, math.copysign(b - b, b)) + + # The remaining special case (b is NaN) is handled just fine by + # the normal code path below. 
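+        # Without the scaling just below, the intermediate a + math.hypot(a, b)
+        # could overflow once a component approaches the dtype's maximum
+        # (THRES is MAX / (1 + sqrt(2))).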
+ + # Scale to avoid overflow + if abs(a) >= THRES or abs(b) >= THRES: + a *= 0.25 + b *= 0.25 + scale = True + else: + scale = False + # Algorithm 312, CACM vol 10, Oct 1967 + if a >= 0: + t = math.sqrt((a + math.hypot(a, b)) * 0.5) + real = t + imag = b / (2 * t) + else: + t = math.sqrt((-a + math.hypot(a, b)) * 0.5) + real = abs(b) / (2 * t) + imag = math.copysign(t, b) + # Rescale + if scale: + return complex(real * 2, imag) + else: + return complex(real, imag) + + res = context.compile_internal(builder, sqrt_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @lower(cmath.cos, types.Complex) +def cos_impl(context, builder, sig, args): + def cos_impl(z): + """cmath.cos(z) = cmath.cosh(z j)""" + return cmath.cosh(complex(-z.imag, z.real)) + + res = context.compile_internal(builder, cos_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @overload(cmath.cosh) +def impl_cmath_cosh(z): + if not isinstance(z, types.Complex): + return + + def cosh_impl(z): + """cmath.cosh(z)""" + x = z.real + y = z.imag + if math.isinf(x): + if math.isnan(y): + # x = +inf, y = NaN => cmath.cosh(x + y j) = inf + Nan * j + real = abs(x) + imag = y + elif y == 0.0: + # x = +inf, y = 0 => cmath.cosh(x + y j) = inf + 0j + real = abs(x) + imag = y + else: + real = math.copysign(x, math.cos(y)) + imag = math.copysign(x, math.sin(y)) + if x < 0.0: + # x = -inf => negate imaginary part of result + imag = -imag + return complex(real, imag) + return complex(math.cos(y) * math.cosh(x), math.sin(y) * math.sinh(x)) + + return cosh_impl + + +# @lower(cmath.sin, types.Complex) +def sin_impl(context, builder, sig, args): + def sin_impl(z): + """cmath.sin(z) = -j * cmath.sinh(z j)""" + r = cmath.sinh(complex(-z.imag, z.real)) + return complex(r.imag, -r.real) + + res = context.compile_internal(builder, sin_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @overload(cmath.sinh) +def impl_cmath_sinh(z): + if not isinstance(z, types.Complex): + return + + def sinh_impl(z): + """cmath.sinh(z)""" + x = z.real + y = z.imag + if math.isinf(x): + if math.isnan(y): + # x = +/-inf, y = NaN => cmath.sinh(x + y j) = x + NaN * j + real = x + imag = y + else: + real = math.cos(y) + imag = math.sin(y) + if real != 0.0: + real *= x + if imag != 0.0: + imag *= abs(x) + return complex(real, imag) + return complex(math.cos(y) * math.sinh(x), math.sin(y) * math.cosh(x)) + + return sinh_impl + + +# @lower(cmath.tan, types.Complex) +def tan_impl(context, builder, sig, args): + def tan_impl(z): + """cmath.tan(z) = -j * cmath.tanh(z j)""" + r = cmath.tanh(complex(-z.imag, z.real)) + return complex(r.imag, -r.real) + + res = context.compile_internal(builder, tan_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @overload(cmath.tanh) +def impl_cmath_tanh(z): + if not isinstance(z, types.Complex): + return + + def tanh_impl(z): + """cmath.tanh(z)""" + x = z.real + y = z.imag + if math.isinf(x): + real = math.copysign(1.0, x) + if math.isinf(y): + imag = 0.0 + else: + imag = math.copysign(0.0, math.sin(2.0 * y)) + return complex(real, imag) + # This is CPython's algorithm (see c_tanh() in cmathmodule.c). + # XXX how to force float constants into single precision? 
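+        # The expressions below evaluate
+        #   tanh(x + iy) = (tanh(x)*(1 + tan(y)**2) + 1j*tan(y)/cosh(x)**2)
+        #                  / (1 + (tanh(x)*tan(y))**2)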
+ tx = math.tanh(x) + ty = math.tan(y) + cx = 1.0 / math.cosh(x) + txty = tx * ty + denom = 1.0 + txty * txty + return complex(tx * (1.0 + ty * ty) / denom, ((ty / denom) * cx) * cx) + + return tanh_impl + + +# @lower(cmath.acos, types.Complex) +def acos_impl(context, builder, sig, args): + LN_4 = math.log(4) + THRES = mathimpl.FLT_MAX / 4 + + def acos_impl(z): + """cmath.acos(z)""" + # CPython's algorithm (see c_acos() in cmathmodule.c) + if abs(z.real) > THRES or abs(z.imag) > THRES: + # Avoid unnecessary overflow for large arguments + # (also handles infinities gracefully) + real = math.atan2(abs(z.imag), z.real) + imag = math.copysign( + math.log(math.hypot(z.real * 0.5, z.imag * 0.5)) + LN_4, -z.imag + ) + return complex(real, imag) + else: + s1 = cmath.sqrt(complex(1.0 - z.real, -z.imag)) + s2 = cmath.sqrt(complex(1.0 + z.real, z.imag)) + real = 2.0 * math.atan2(s1.real, s2.real) + imag = math.asinh(s2.real * s1.imag - s2.imag * s1.real) + return complex(real, imag) + + res = context.compile_internal(builder, acos_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @overload(cmath.acosh) +def impl_cmath_acosh(z): + if not isinstance(z, types.Complex): + return + + LN_4 = math.log(4) + THRES = mathimpl.FLT_MAX / 4 + + def acosh_impl(z): + """cmath.acosh(z)""" + # CPython's algorithm (see c_acosh() in cmathmodule.c) + if abs(z.real) > THRES or abs(z.imag) > THRES: + # Avoid unnecessary overflow for large arguments + # (also handles infinities gracefully) + real = math.log(math.hypot(z.real * 0.5, z.imag * 0.5)) + LN_4 + imag = math.atan2(z.imag, z.real) + return complex(real, imag) + else: + s1 = cmath.sqrt(complex(z.real - 1.0, z.imag)) + s2 = cmath.sqrt(complex(z.real + 1.0, z.imag)) + real = math.asinh(s1.real * s2.real + s1.imag * s2.imag) + imag = 2.0 * math.atan2(s1.imag, s2.real) + return complex(real, imag) + # Condensed formula (NumPy) + # return cmath.log(z + cmath.sqrt(z + 1.) 
* cmath.sqrt(z - 1.)) + + return acosh_impl + + +# @lower(cmath.asinh, types.Complex) +def asinh_impl(context, builder, sig, args): + LN_4 = math.log(4) + THRES = mathimpl.FLT_MAX / 4 + + def asinh_impl(z): + """cmath.asinh(z)""" + # CPython's algorithm (see c_asinh() in cmathmodule.c) + if abs(z.real) > THRES or abs(z.imag) > THRES: + real = math.copysign( + math.log(math.hypot(z.real * 0.5, z.imag * 0.5)) + LN_4, z.real + ) + imag = math.atan2(z.imag, abs(z.real)) + return complex(real, imag) + else: + s1 = cmath.sqrt(complex(1.0 + z.imag, -z.real)) + s2 = cmath.sqrt(complex(1.0 - z.imag, z.real)) + real = math.asinh(s1.real * s2.imag - s2.real * s1.imag) + imag = math.atan2(z.imag, s1.real * s2.real - s1.imag * s2.imag) + return complex(real, imag) + + res = context.compile_internal(builder, asinh_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @lower(cmath.asin, types.Complex) +def asin_impl(context, builder, sig, args): + def asin_impl(z): + """cmath.asin(z) = -j * cmath.asinh(z j)""" + r = cmath.asinh(complex(-z.imag, z.real)) + return complex(r.imag, -r.real) + + res = context.compile_internal(builder, asin_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @lower(cmath.atan, types.Complex) +def atan_impl(context, builder, sig, args): + def atan_impl(z): + """cmath.atan(z) = -j * cmath.atanh(z j)""" + r = cmath.atanh(complex(-z.imag, z.real)) + if math.isinf(z.real) and math.isnan(z.imag): + # XXX this is odd but necessary + return complex(r.imag, r.real) + else: + return complex(r.imag, -r.real) + + res = context.compile_internal(builder, atan_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) + + +# @lower(cmath.atanh, types.Complex) +def atanh_impl(context, builder, sig, args): + THRES_LARGE = math.sqrt(mathimpl.FLT_MAX / 4) + THRES_SMALL = math.sqrt(mathimpl.FLT_MIN) + PI_12 = math.pi / 2 + + def atanh_impl(z): + """cmath.atanh(z)""" + # CPython's algorithm (see c_atanh() in cmathmodule.c) + if z.real < 0.0: + # Reduce to case where z.real >= 0., using atanh(z) = -atanh(-z). + negate = True + z = -z + else: + negate = False + + ay = abs(z.imag) + if math.isnan(z.real) or z.real > THRES_LARGE or ay > THRES_LARGE: + if math.isinf(z.imag): + real = math.copysign(0.0, z.real) + elif math.isinf(z.real): + real = 0.0 + else: + # may be safe from overflow, depending on hypot's implementation... + h = math.hypot(z.real * 0.5, z.imag * 0.5) + real = z.real / 4.0 / h / h + imag = -math.copysign(PI_12, -z.imag) + elif z.real == 1.0 and ay < THRES_SMALL: + # C99 standard says: atanh(1+/-0.) should be inf +/- 0j + if ay == 0.0: + real = INF + imag = z.imag + else: + real = -math.log(math.sqrt(ay) / math.sqrt(math.hypot(ay, 2.0))) + imag = math.copysign(math.atan2(2.0, -ay) / 2, z.imag) + else: + sqay = ay * ay + zr1 = 1 - z.real + real = math.log1p(4.0 * z.real / (zr1 * zr1 + sqay)) * 0.25 + imag = -math.atan2(-2.0 * z.imag, zr1 * (1 + z.real) - sqay) * 0.5 + + if math.isnan(z.imag): + imag = NAN + if negate: + return complex(-real, -imag) + else: + return complex(real, imag) + + res = context.compile_internal(builder, atanh_impl, sig, args) + return impl_ret_untracked(context, builder, sig, res) diff --git a/numba_cuda/numba/cuda/np/math/mathimpl.py b/numba_cuda/numba/cuda/np/math/mathimpl.py new file mode 100644 index 000000000..06471acae --- /dev/null +++ b/numba_cuda/numba/cuda/np/math/mathimpl.py @@ -0,0 +1,487 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +""" +Provide math calls that uses intrinsics or libc math functions. +""" + +import math +import operator +import sys +import numpy as np + +import llvmlite.ir +from llvmlite.ir import Constant + +from numba.core.imputils import impl_ret_untracked +from numba.core import types, config +from numba.cuda import cgutils +from numba.core.extending import overload +from numba.cuda.typing import signature +from numba.cpython.unsafe.numbers import trailing_zeros + + +# registry = Registry('mathimpl') +# lower = registry.lower + + +# Helpers, shared with cmathimpl. +_NP_FLT_FINFO = np.finfo(np.dtype("float32")) +FLT_MAX = _NP_FLT_FINFO.max +FLT_MIN = _NP_FLT_FINFO.tiny + +_NP_DBL_FINFO = np.finfo(np.dtype("float64")) +DBL_MAX = _NP_DBL_FINFO.max +DBL_MIN = _NP_DBL_FINFO.tiny + +FLOAT_ABS_MASK = 0x7FFFFFFF +FLOAT_SIGN_MASK = 0x80000000 +DOUBLE_ABS_MASK = 0x7FFFFFFFFFFFFFFF +DOUBLE_SIGN_MASK = 0x8000000000000000 + + +def is_nan(builder, val): + """ + Return a condition testing whether *val* is a NaN. + """ + return builder.fcmp_unordered("uno", val, val) + + +def is_inf(builder, val): + """ + Return a condition testing whether *val* is an infinite. + """ + pos_inf = Constant(val.type, float("+inf")) + neg_inf = Constant(val.type, float("-inf")) + isposinf = builder.fcmp_ordered("==", val, pos_inf) + isneginf = builder.fcmp_ordered("==", val, neg_inf) + return builder.or_(isposinf, isneginf) + + +def is_finite(builder, val): + """ + Return a condition testing whether *val* is a finite. + """ + # is_finite(x) <=> x - x != NaN + val_minus_val = builder.fsub(val, val) + return builder.fcmp_ordered("ord", val_minus_val, val_minus_val) + + +def f64_as_int64(builder, val): + """ + Bitcast a double into a 64-bit integer. + """ + assert val.type == llvmlite.ir.DoubleType() + return builder.bitcast(val, llvmlite.ir.IntType(64)) + + +def int64_as_f64(builder, val): + """ + Bitcast a 64-bit integer into a double. + """ + assert val.type == llvmlite.ir.IntType(64) + return builder.bitcast(val, llvmlite.ir.DoubleType()) + + +def f32_as_int32(builder, val): + """ + Bitcast a float into a 32-bit integer. + """ + assert val.type == llvmlite.ir.FloatType() + return builder.bitcast(val, llvmlite.ir.IntType(32)) + + +def int32_as_f32(builder, val): + """ + Bitcast a 32-bit integer into a float. + """ + assert val.type == llvmlite.ir.IntType(32) + return builder.bitcast(val, llvmlite.ir.FloatType()) + + +def negate_real(builder, val): + """ + Negate real number *val*, with proper handling of zeros. + """ + # The negative zero forces LLVM to handle signed zeros properly. + return builder.fsub(Constant(val.type, -0.0), val) + + +def call_fp_intrinsic(builder, name, args): + """ + Call a LLVM intrinsic floating-point operation. + """ + mod = builder.module + intr = mod.declare_intrinsic(name, [a.type for a in args]) + return builder.call(intr, args) + + +def _unary_int_input_wrapper_impl(wrapped_impl): + """ + Return an implementation factory to convert the single integral input + argument to a float64, then defer to the *wrapped_impl*. 
+ """ + + def implementer(context, builder, sig, args): + (val,) = args + input_type = sig.args[0] + fpval = context.cast(builder, val, input_type, types.float64) + inner_sig = signature(types.float64, types.float64) + res = wrapped_impl(context, builder, inner_sig, (fpval,)) + return context.cast(builder, res, types.float64, sig.return_type) + + return implementer + + +def unary_math_int_impl(fn, float_impl): + impl = _unary_int_input_wrapper_impl(float_impl) # noqa: F841 + # lower(fn, types.Integer)(impl) + + +def unary_math_intr(fn, intrcode): + """ + Implement the math function *fn* using the LLVM intrinsic *intrcode*. + """ + + # @lower(fn, types.Float) + def float_impl(context, builder, sig, args): + res = call_fp_intrinsic(builder, intrcode, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + unary_math_int_impl(fn, float_impl) + return float_impl + + +def unary_math_extern(fn, f32extern, f64extern, int_restype=False): + """ + Register implementations of Python function *fn* using the + external function named *f32extern* and *f64extern* (for float32 + and float64 inputs, respectively). + If *int_restype* is true, then the function's return value should be + integral, otherwise floating-point. + """ + f_restype = types.int64 if int_restype else None # noqa: F841 + + def float_impl(context, builder, sig, args): + """ + Implement *fn* for a types.Float input. + """ + [val] = args + mod = builder.module # noqa: F841 + input_type = sig.args[0] + lty = context.get_value_type(input_type) + func_name = { + types.float32: f32extern, + types.float64: f64extern, + }[input_type] + fnty = llvmlite.ir.FunctionType(lty, [lty]) + fn = cgutils.insert_pure_function(builder.module, fnty, name=func_name) + res = builder.call(fn, (val,)) + res = context.cast(builder, res, input_type, sig.return_type) + return impl_ret_untracked(context, builder, sig.return_type, res) + + # lower(fn, types.Float)(float_impl) + + # Implement wrapper for integer inputs + unary_math_int_impl(fn, float_impl) + + return float_impl + + +unary_math_intr(math.fabs, "llvm.fabs") +exp_impl = unary_math_intr(math.exp, "llvm.exp") +log_impl = unary_math_intr(math.log, "llvm.log") +log10_impl = unary_math_intr(math.log10, "llvm.log10") +sin_impl = unary_math_intr(math.sin, "llvm.sin") +cos_impl = unary_math_intr(math.cos, "llvm.cos") + +log1p_impl = unary_math_extern(math.log1p, "log1pf", "log1p") +expm1_impl = unary_math_extern(math.expm1, "expm1f", "expm1") +erf_impl = unary_math_extern(math.erf, "erff", "erf") +erfc_impl = unary_math_extern(math.erfc, "erfcf", "erfc") + +tan_impl = unary_math_extern(math.tan, "tanf", "tan") +asin_impl = unary_math_extern(math.asin, "asinf", "asin") +acos_impl = unary_math_extern(math.acos, "acosf", "acos") +atan_impl = unary_math_extern(math.atan, "atanf", "atan") + +asinh_impl = unary_math_extern(math.asinh, "asinhf", "asinh") +acosh_impl = unary_math_extern(math.acosh, "acoshf", "acosh") +atanh_impl = unary_math_extern(math.atanh, "atanhf", "atanh") +sinh_impl = unary_math_extern(math.sinh, "sinhf", "sinh") +cosh_impl = unary_math_extern(math.cosh, "coshf", "cosh") +tanh_impl = unary_math_extern(math.tanh, "tanhf", "tanh") + +log2_impl = unary_math_extern(math.log2, "log2f", "log2") +ceil_impl = unary_math_extern(math.ceil, "ceilf", "ceil", True) +floor_impl = unary_math_extern(math.floor, "floorf", "floor", True) + +gamma_impl = unary_math_extern( + math.gamma, "numba_gammaf", "numba_gamma" +) # work-around +sqrt_impl = unary_math_extern(math.sqrt, "sqrtf", "sqrt") 
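+# Illustrative aside (not part of the original patch): the registrations in
+# this block pair each Python math function with a float32 and a float64
+# libm symbol.  A minimal pure-Python model of that name selection -- the
+# helper name ``pick_extern`` is hypothetical, for demonstration only:
+#
+#     def pick_extern(input_type, f32extern, f64extern):
+#         return {types.float32: f32extern,
+#                 types.float64: f64extern}[input_type]
+#
+#     pick_extern(types.float32, "sqrtf", "sqrt")   # -> "sqrtf"
+#     pick_extern(types.float64, "sqrtf", "sqrt")   # -> "sqrt"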
+trunc_impl = unary_math_extern(math.trunc, "truncf", "trunc", True) +lgamma_impl = unary_math_extern(math.lgamma, "lgammaf", "lgamma") + + +# @lower(math.isnan, types.Float) +def isnan_float_impl(context, builder, sig, args): + [val] = args + res = is_nan(builder, val) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.isnan, types.Integer) +def isnan_int_impl(context, builder, sig, args): + res = cgutils.false_bit + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.isinf, types.Float) +def isinf_float_impl(context, builder, sig, args): + [val] = args + res = is_inf(builder, val) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.isinf, types.Integer) +def isinf_int_impl(context, builder, sig, args): + res = cgutils.false_bit + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.isfinite, types.Float) +def isfinite_float_impl(context, builder, sig, args): + [val] = args + res = is_finite(builder, val) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.isfinite, types.Integer) +def isfinite_int_impl(context, builder, sig, args): + res = cgutils.true_bit + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.copysign, types.Float, types.Float) +def copysign_float_impl(context, builder, sig, args): + lty = args[0].type + mod = builder.module + fn = cgutils.get_or_insert_function( + mod, + llvmlite.ir.FunctionType(lty, (lty, lty)), + "llvm.copysign.%s" % lty.intrinsic_name, + ) + res = builder.call(fn, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# ----------------------------------------------------------------------------- + + +# @lower(math.frexp, types.Float) +def frexp_impl(context, builder, sig, args): + (val,) = args + fltty = context.get_data_type(sig.args[0]) + intty = context.get_data_type(sig.return_type[1]) + expptr = cgutils.alloca_once(builder, intty, name="exp") + fnty = llvmlite.ir.FunctionType( + fltty, (fltty, llvmlite.ir.PointerType(intty)) + ) + fname = { + "float": "numba_frexpf", + "double": "numba_frexp", + }[str(fltty)] + fn = cgutils.get_or_insert_function(builder.module, fnty, fname) + res = builder.call(fn, (val, expptr)) + res = cgutils.make_anonymous_struct(builder, (res, builder.load(expptr))) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.ldexp, types.Float, types.intc) +def ldexp_impl(context, builder, sig, args): + val, exp = args + fltty, intty = map(context.get_data_type, sig.args) + fnty = llvmlite.ir.FunctionType(fltty, (fltty, intty)) + fname = { + "float": "numba_ldexpf", + "double": "numba_ldexp", + }[str(fltty)] + fn = cgutils.insert_pure_function(builder.module, fnty, name=fname) + res = builder.call(fn, (val, exp)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# ----------------------------------------------------------------------------- + + +# @lower(math.atan2, types.int64, types.int64) +def atan2_s64_impl(context, builder, sig, args): + [y, x] = args + y = builder.sitofp(y, llvmlite.ir.DoubleType()) + x = builder.sitofp(x, llvmlite.ir.DoubleType()) + fsig = signature(types.float64, types.float64, types.float64) + return atan2_float_impl(context, builder, fsig, (y, x)) + + +# @lower(math.atan2, types.uint64, types.uint64) +def atan2_u64_impl(context, builder, sig, args): + [y, x] = args + y = builder.uitofp(y, llvmlite.ir.DoubleType()) + x = 
builder.uitofp(x, llvmlite.ir.DoubleType()) + fsig = signature(types.float64, types.float64, types.float64) + return atan2_float_impl(context, builder, fsig, (y, x)) + + +# @lower(math.atan2, types.Float, types.Float) +def atan2_float_impl(context, builder, sig, args): + assert len(args) == 2 + mod = builder.module # noqa: F841 + ty = sig.args[0] + lty = context.get_value_type(ty) + func_name = {types.float32: "atan2f", types.float64: "atan2"}[ty] + fnty = llvmlite.ir.FunctionType(lty, (lty, lty)) + fn = cgutils.insert_pure_function(builder.module, fnty, name=func_name) + res = builder.call(fn, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# ----------------------------------------------------------------------------- + + +# @lower(math.hypot, types.int64, types.int64) +def hypot_s64_impl(context, builder, sig, args): + [x, y] = args + y = builder.sitofp(y, llvmlite.ir.DoubleType()) + x = builder.sitofp(x, llvmlite.ir.DoubleType()) + fsig = signature(types.float64, types.float64, types.float64) + res = hypot_float_impl(context, builder, fsig, (x, y)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.hypot, types.uint64, types.uint64) +def hypot_u64_impl(context, builder, sig, args): + [x, y] = args + y = builder.sitofp(y, llvmlite.ir.DoubleType()) + x = builder.sitofp(x, llvmlite.ir.DoubleType()) + fsig = signature(types.float64, types.float64, types.float64) + res = hypot_float_impl(context, builder, fsig, (x, y)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower(math.hypot, types.Float, types.Float) +def hypot_float_impl(context, builder, sig, args): + xty, yty = sig.args + assert xty == yty == sig.return_type + x, y = args + + # Windows has alternate names for hypot/hypotf, see + # https://msdn.microsoft.com/fr-fr/library/a9yb3dbt%28v=vs.80%29.aspx + fname = { + types.float32: "_hypotf" if sys.platform == "win32" else "hypotf", + types.float64: "_hypot" if sys.platform == "win32" else "hypot", + }[xty] + plat_hypot = types.ExternalFunction(fname, sig) + + if sys.platform == "win32" and config.MACHINE_BITS == 32: + inf = xty(float("inf")) + + def hypot_impl(x, y): + if math.isinf(x) or math.isinf(y): + return inf + return plat_hypot(x, y) + else: + + def hypot_impl(x, y): + return plat_hypot(x, y) + + res = context.compile_internal(builder, hypot_impl, sig, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# ----------------------------------------------------------------------------- + + +# @lower(math.radians, types.Float) +def radians_float_impl(context, builder, sig, args): + [x] = args + coef = context.get_constant(sig.return_type, math.pi / 180) + res = builder.fmul(x, coef) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +unary_math_int_impl(math.radians, radians_float_impl) + +# ----------------------------------------------------------------------------- + + +# @lower(math.degrees, types.Float) +def degrees_float_impl(context, builder, sig, args): + [x] = args + coef = context.get_constant(sig.return_type, 180 / math.pi) + res = builder.fmul(x, coef) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +unary_math_int_impl(math.degrees, degrees_float_impl) + +# ----------------------------------------------------------------------------- + + +# @lower(math.pow, types.Float, types.Float) +# @lower(math.pow, types.Float, types.Integer) +def pow_impl(context, builder, sig, args): + impl = 
context.get_function(operator.pow, sig) + return impl(builder, args) + + +# ----------------------------------------------------------------------------- + + +def _unsigned(T): + """Convert integer to unsigned integer of equivalent width.""" + pass + + +@overload(_unsigned) +def _unsigned_impl(T): + if T in types.unsigned_domain: + return lambda T: T + elif T in types.signed_domain: + newT = getattr(types, "uint{}".format(T.bitwidth)) + return lambda T: newT(T) + + +def gcd_impl(context, builder, sig, args): + xty, yty = sig.args + assert xty == yty == sig.return_type + x, y = args + + def gcd(a, b): + """ + Stein's algorithm, heavily cribbed from Julia implementation. + """ + T = type(a) + if a == 0: + return abs(b) + if b == 0: + return abs(a) + za = trailing_zeros(a) + zb = trailing_zeros(b) + k = min(za, zb) + # Uses np.*_shift instead of operators due to return types + u = _unsigned(abs(np.right_shift(a, za))) + v = _unsigned(abs(np.right_shift(b, zb))) + while u != v: + if u > v: + u, v = v, u + v -= u + v = np.right_shift(v, trailing_zeros(v)) + r = np.left_shift(T(u), k) + return r + + res = context.compile_internal(builder, gcd, sig, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# lower(math.gcd, types.Integer, types.Integer)(gcd_impl) diff --git a/numba_cuda/numba/cuda/np/math/numbers.py b/numba_cuda/numba/cuda/np/math/numbers.py new file mode 100644 index 000000000..9cc23a5c5 --- /dev/null +++ b/numba_cuda/numba/cuda/np/math/numbers.py @@ -0,0 +1,1461 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +import math +import numbers + +import numpy as np + +from llvmlite import ir +from llvmlite.ir import Constant + +from numba.core.imputils import impl_ret_untracked +from numba.core import typing, types, errors +from numba.cuda import cgutils +from numba.cpython.unsafe.numbers import viewer + + +def _int_arith_flags(rettype): + """ + Return the modifier flags for integer arithmetic. + """ + if rettype.signed: + # Ignore the effects of signed overflow. This is important for + # optimization of some indexing operations. For example + # array[i+1] could see `i+1` trigger a signed overflow and + # give a negative number. With Python's indexing, a negative + # index is treated differently: its resolution has a runtime cost. + # Telling LLVM to ignore signed overflows allows it to optimize + # away the check for a negative `i+1` if it knows `i` is positive. 
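+        # Illustrative note (sketch, not exercised by this code): with this
+        # flag the builder emits e.g. ``add nsw i64 %i, 1``; "nsw" (no signed
+        # wrap) lets LLVM assume the addition never overflows, so a later
+        # check for a negative ``i + 1`` can be folded away when ``i`` is
+        # known to be non-negative.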
+ return ["nsw"] + else: + return [] + + +def int_add_impl(context, builder, sig, args): + [va, vb] = args + [ta, tb] = sig.args + a = context.cast(builder, va, ta, sig.return_type) + b = context.cast(builder, vb, tb, sig.return_type) + res = builder.add(a, b, flags=_int_arith_flags(sig.return_type)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_sub_impl(context, builder, sig, args): + [va, vb] = args + [ta, tb] = sig.args + a = context.cast(builder, va, ta, sig.return_type) + b = context.cast(builder, vb, tb, sig.return_type) + res = builder.sub(a, b, flags=_int_arith_flags(sig.return_type)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_mul_impl(context, builder, sig, args): + [va, vb] = args + [ta, tb] = sig.args + a = context.cast(builder, va, ta, sig.return_type) + b = context.cast(builder, vb, tb, sig.return_type) + res = builder.mul(a, b, flags=_int_arith_flags(sig.return_type)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_divmod_signed(context, builder, ty, x, y): + """ + Reference Objects/intobject.c + xdivy = x / y; + xmody = (long)(x - (unsigned long)xdivy * y); + /* If the signs of x and y differ, and the remainder is non-0, + * C89 doesn't define whether xdivy is now the floor or the + * ceiling of the infinitely precise quotient. We want the floor, + * and we have it iff the remainder's sign matches y's. + */ + if (xmody && ((y ^ xmody) < 0) /* i.e. and signs differ */) { + xmody += y; + --xdivy; + assert(xmody && ((y ^ xmody) >= 0)); + } + *p_xdivy = xdivy; + *p_xmody = xmody; + """ + assert x.type == y.type + + ZERO = y.type(0) + ONE = y.type(1) + + # NOTE: On x86 at least, dividing the lowest representable integer + # (e.g. 0x80000000 for int32) by -1 causes a SIFGPE (division overflow), + # causing the process to crash. + # We return 0, 0 instead (more or less like Numpy). + + resdiv = cgutils.alloca_once_value(builder, ZERO) + resmod = cgutils.alloca_once_value(builder, ZERO) + + is_overflow = builder.and_( + builder.icmp_signed("==", x, x.type(ty.minval)), + builder.icmp_signed("==", y, y.type(-1)), + ) + + with builder.if_then(builder.not_(is_overflow), likely=True): + # Note LLVM will optimize this to a single divmod instruction, + # if available on the target CPU (e.g. x86). + xdivy = builder.sdiv(x, y) + xmody = builder.srem(x, y) + + y_xor_xmody_ltz = builder.icmp_signed("<", builder.xor(y, xmody), ZERO) + xmody_istrue = builder.icmp_signed("!=", xmody, ZERO) + cond = builder.and_(xmody_istrue, y_xor_xmody_ltz) + + with builder.if_else(cond) as (if_different_signs, if_same_signs): + with if_same_signs: + builder.store(xdivy, resdiv) + builder.store(xmody, resmod) + + with if_different_signs: + builder.store(builder.sub(xdivy, ONE), resdiv) + builder.store(builder.add(xmody, y), resmod) + + return builder.load(resdiv), builder.load(resmod) + + +def int_divmod(context, builder, ty, x, y): + """ + Integer divmod(x, y). The caller must ensure that y != 0. 
+ """ + if ty.signed: + return int_divmod_signed(context, builder, ty, x, y) + else: + return builder.udiv(x, y), builder.urem(x, y) + + +def _int_divmod_impl(context, builder, sig, args, zerodiv_message): + va, vb = args + ta, tb = sig.args + + ty = sig.return_type + if isinstance(ty, types.UniTuple): + ty = ty.dtype + a = context.cast(builder, va, ta, ty) + b = context.cast(builder, vb, tb, ty) + quot = cgutils.alloca_once(builder, a.type, name="quot") + rem = cgutils.alloca_once(builder, a.type, name="rem") + + with builder.if_else(cgutils.is_scalar_zero(builder, b), likely=False) as ( + if_zero, + if_non_zero, + ): + with if_zero: + if not context.error_model.fp_zero_division( + builder, (zerodiv_message,) + ): + # No exception raised => return 0 + # XXX We should also set the FPU exception status, but + # there's no easy way to do that from LLVM. + builder.store(b, quot) + builder.store(b, rem) + with if_non_zero: + q, r = int_divmod(context, builder, ty, a, b) + builder.store(q, quot) + builder.store(r, rem) + + return quot, rem + + +# @lower_builtin(divmod, types.Integer, types.Integer) +def int_divmod_impl(context, builder, sig, args): + quot, rem = _int_divmod_impl( + context, builder, sig, args, "integer divmod by zero" + ) + + return cgutils.pack_array(builder, (builder.load(quot), builder.load(rem))) + + +# @lower_builtin(operator.floordiv, types.Integer, types.Integer) +# @lower_builtin(operator.ifloordiv, types.Integer, types.Integer) +def int_floordiv_impl(context, builder, sig, args): + quot, rem = _int_divmod_impl( + context, builder, sig, args, "integer division by zero" + ) + return builder.load(quot) + + +# @lower_builtin(operator.truediv, types.Integer, types.Integer) +# @lower_builtin(operator.itruediv, types.Integer, types.Integer) +def int_truediv_impl(context, builder, sig, args): + [va, vb] = args + [ta, tb] = sig.args + a = context.cast(builder, va, ta, sig.return_type) + b = context.cast(builder, vb, tb, sig.return_type) + with cgutils.if_zero(builder, b): + context.error_model.fp_zero_division(builder, ("division by zero",)) + res = builder.fdiv(a, b) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower_builtin(operator.mod, types.Integer, types.Integer) +# @lower_builtin(operator.imod, types.Integer, types.Integer) +def int_rem_impl(context, builder, sig, args): + quot, rem = _int_divmod_impl( + context, builder, sig, args, "integer modulo by zero" + ) + return builder.load(rem) + + +def _get_power_zerodiv_return(context, return_type): + if ( + isinstance(return_type, types.Integer) + and not context.error_model.raise_on_fp_zero_division + ): + # If not raising, return 0x8000... 
when computing 0 ** + return -1 << (return_type.bitwidth - 1) + else: + return False + + +def int_power_impl(context, builder, sig, args): + """ + a ^ b, where a is an integer or real, and b an integer + """ + is_integer = isinstance(sig.args[0], types.Integer) + tp = sig.return_type + zerodiv_return = _get_power_zerodiv_return(context, tp) + + def int_power(a, b): + # Ensure computations are done with a large enough width + r = tp(1) + a = tp(a) + if b < 0: + invert = True + exp = -b + if exp < 0: + raise OverflowError + if is_integer: + if a == 0: + if zerodiv_return: + return zerodiv_return + else: + raise ZeroDivisionError( + "0 cannot be raised to a negative power" + ) + if a != 1 and a != -1: + return 0 + else: + invert = False + exp = b + if exp > 0x10000: + # Optimization cutoff: fallback on the generic algorithm + return math.pow(a, float(b)) + while exp != 0: + if exp & 1: + r *= a + exp >>= 1 + a *= a + + return 1.0 / r if invert else r + + res = context.compile_internal(builder, int_power, sig, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower_builtin(operator.pow, types.Integer, types.IntegerLiteral) +# @lower_builtin(operator.ipow, types.Integer, types.IntegerLiteral) +# @lower_builtin(operator.pow, types.Float, types.IntegerLiteral) +# @lower_builtin(operator.ipow, types.Float, types.IntegerLiteral) +def static_power_impl(context, builder, sig, args): + """ + a ^ b, where a is an integer or real, and b a constant integer + """ + exp = sig.args[1].value + if not isinstance(exp, numbers.Integral): + raise NotImplementedError + if abs(exp) > 0x10000: + # Optimization cutoff: fallback on the generic algorithm above + raise NotImplementedError + invert = exp < 0 + exp = abs(exp) + + tp = sig.return_type + is_integer = isinstance(tp, types.Integer) + zerodiv_return = _get_power_zerodiv_return(context, tp) + + val = context.cast(builder, args[0], sig.args[0], tp) + lty = val.type + + def mul(a, b): + if is_integer: + return builder.mul(a, b) + else: + return builder.fmul(a, b) + + # Unroll the exponentiation loop + res = lty(1) + while exp != 0: + if exp & 1: + res = mul(res, val) + exp >>= 1 + val = mul(val, val) + + if invert: + # If the exponent was negative, fix the result by inverting it + if is_integer: + # Integer inversion + def invert_impl(a): + if a == 0: + if zerodiv_return: + return zerodiv_return + else: + raise ZeroDivisionError( + "0 cannot be raised to a negative power" + ) + if a != 1 and a != -1: + return 0 + else: + return a + + else: + # Real inversion + def invert_impl(a): + return 1.0 / a + + res = context.compile_internal( + builder, invert_impl, typing.signature(tp, tp), (res,) + ) + + return res + + +def int_slt_impl(context, builder, sig, args): + res = builder.icmp_signed("<", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_sle_impl(context, builder, sig, args): + res = builder.icmp_signed("<=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_sgt_impl(context, builder, sig, args): + res = builder.icmp_signed(">", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_sge_impl(context, builder, sig, args): + res = builder.icmp_signed(">=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_ult_impl(context, builder, sig, args): + res = builder.icmp_unsigned("<", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_ule_impl(context, 
builder, sig, args): + res = builder.icmp_unsigned("<=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_ugt_impl(context, builder, sig, args): + res = builder.icmp_unsigned(">", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_uge_impl(context, builder, sig, args): + res = builder.icmp_unsigned(">=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_eq_impl(context, builder, sig, args): + res = builder.icmp_unsigned("==", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_ne_impl(context, builder, sig, args): + res = builder.icmp_unsigned("!=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_signed_unsigned_cmp(op): + def impl(context, builder, sig, args): + (left, right) = args + # This code is translated from the NumPy source. + # What we're going to do is divide the range of a signed value at zero. + # If the signed value is less than zero, then we can treat zero as the + # unsigned value since the unsigned value is necessarily zero or larger + # and any signed comparison between a negative value and zero/infinity + # will yield the same result. If the signed value is greater than or + # equal to zero, then we can safely cast it to an unsigned value and do + # the expected unsigned-unsigned comparison operation. + # Original: https://github.com/numpy/numpy/pull/23713 + cmp_zero = builder.icmp_signed("<", left, Constant(left.type, 0)) + lt_zero = builder.icmp_signed(op, left, Constant(left.type, 0)) + ge_zero = builder.icmp_unsigned(op, left, right) + res = builder.select(cmp_zero, lt_zero, ge_zero) + return impl_ret_untracked(context, builder, sig.return_type, res) + + return impl + + +def int_unsigned_signed_cmp(op): + def impl(context, builder, sig, args): + (left, right) = args + # See the function `int_signed_unsigned_cmp` for implementation notes. 
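+        # Worked example (illustrative): for op "<" with an unsigned left of
+        # 5 and a signed right of -1, ``right < 0`` holds, so the selected
+        # result is ``0 < -1``, i.e. False -- matching the mathematical
+        # ordering (5 < -1 is false).  When ``right >= 0`` the plain unsigned
+        # comparison of the two operands is used unchanged.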
+ cmp_zero = builder.icmp_signed("<", right, Constant(right.type, 0)) + lt_zero = builder.icmp_signed(op, Constant(right.type, 0), right) + ge_zero = builder.icmp_unsigned(op, left, right) + res = builder.select(cmp_zero, lt_zero, ge_zero) + return impl_ret_untracked(context, builder, sig.return_type, res) + + return impl + + +def int_abs_impl(context, builder, sig, args): + [x] = args + ZERO = Constant(x.type, None) + ltz = builder.icmp_signed("<", x, ZERO) + negated = builder.neg(x) + res = builder.select(ltz, negated, x) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def identity_impl(context, builder, sig, args): + [x] = args + return impl_ret_untracked(context, builder, sig.return_type, x) + + +def uint_abs_impl(context, builder, sig, args): + [x] = args + return impl_ret_untracked(context, builder, sig.return_type, x) + + +def int_shl_impl(context, builder, sig, args): + [valty, amtty] = sig.args + [val, amt] = args + val = context.cast(builder, val, valty, sig.return_type) + amt = context.cast(builder, amt, amtty, sig.return_type) + res = builder.shl(val, amt) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_shr_impl(context, builder, sig, args): + [valty, amtty] = sig.args + [val, amt] = args + val = context.cast(builder, val, valty, sig.return_type) + amt = context.cast(builder, amt, amtty, sig.return_type) + if sig.return_type.signed: + res = builder.ashr(val, amt) + else: + res = builder.lshr(val, amt) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_and_impl(context, builder, sig, args): + [at, bt] = sig.args + [av, bv] = args + cav = context.cast(builder, av, at, sig.return_type) + cbc = context.cast(builder, bv, bt, sig.return_type) + res = builder.and_(cav, cbc) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_or_impl(context, builder, sig, args): + [at, bt] = sig.args + [av, bv] = args + cav = context.cast(builder, av, at, sig.return_type) + cbc = context.cast(builder, bv, bt, sig.return_type) + res = builder.or_(cav, cbc) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_xor_impl(context, builder, sig, args): + [at, bt] = sig.args + [av, bv] = args + cav = context.cast(builder, av, at, sig.return_type) + cbc = context.cast(builder, bv, bt, sig.return_type) + res = builder.xor(cav, cbc) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_negate_impl(context, builder, sig, args): + [typ] = sig.args + [val] = args + # Negate before upcasting, for unsigned numbers + res = builder.neg(val) + res = context.cast(builder, res, typ, sig.return_type) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_positive_impl(context, builder, sig, args): + [typ] = sig.args + [val] = args + res = context.cast(builder, val, typ, sig.return_type) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_invert_impl(context, builder, sig, args): + [typ] = sig.args + [val] = args + # Invert before upcasting, for unsigned numbers + res = builder.xor(val, Constant(val.type, int("1" * val.type.width, 2))) + res = context.cast(builder, res, typ, sig.return_type) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def int_sign_impl(context, builder, sig, args): + """ + np.sign(int) + """ + [x] = args + POS = Constant(x.type, 1) + NEG = Constant(x.type, -1) + ZERO = Constant(x.type, 0) + + cmp_zero = builder.icmp_unsigned("==", x, ZERO) + cmp_pos = 
builder.icmp_signed(">", x, ZERO) + + presult = cgutils.alloca_once(builder, x.type) + + bb_zero = builder.append_basic_block(".zero") + bb_postest = builder.append_basic_block(".postest") + bb_pos = builder.append_basic_block(".pos") + bb_neg = builder.append_basic_block(".neg") + bb_exit = builder.append_basic_block(".exit") + + builder.cbranch(cmp_zero, bb_zero, bb_postest) + + with builder.goto_block(bb_zero): + builder.store(ZERO, presult) + builder.branch(bb_exit) + + with builder.goto_block(bb_postest): + builder.cbranch(cmp_pos, bb_pos, bb_neg) + + with builder.goto_block(bb_pos): + builder.store(POS, presult) + builder.branch(bb_exit) + + with builder.goto_block(bb_neg): + builder.store(NEG, presult) + builder.branch(bb_exit) + + builder.position_at_end(bb_exit) + res = builder.load(presult) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def bool_negate_impl(context, builder, sig, args): + [typ] = sig.args + [val] = args + res = context.cast(builder, val, typ, sig.return_type) + res = builder.neg(res) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def bool_unary_positive_impl(context, builder, sig, args): + [typ] = sig.args + [val] = args + res = context.cast(builder, val, typ, sig.return_type) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# lower_builtin(operator.eq, types.boolean, types.boolean)(int_eq_impl) +# lower_builtin(operator.ne, types.boolean, types.boolean)(int_ne_impl) +# lower_builtin(operator.lt, types.boolean, types.boolean)(int_ult_impl) +# lower_builtin(operator.le, types.boolean, types.boolean)(int_ule_impl) +# lower_builtin(operator.gt, types.boolean, types.boolean)(int_ugt_impl) +# lower_builtin(operator.ge, types.boolean, types.boolean)(int_uge_impl) +# lower_builtin(operator.neg, types.boolean)(bool_negate_impl) +# lower_builtin(operator.pos, types.boolean)(bool_unary_positive_impl) + + +# def _implement_integer_operators(): +# ty = types.Integer + +# lower_builtin(operator.add, ty, ty)(int_add_impl) +# lower_builtin(operator.iadd, ty, ty)(int_add_impl) +# lower_builtin(operator.sub, ty, ty)(int_sub_impl) +# lower_builtin(operator.isub, ty, ty)(int_sub_impl) +# lower_builtin(operator.mul, ty, ty)(int_mul_impl) +# lower_builtin(operator.imul, ty, ty)(int_mul_impl) +# lower_builtin(operator.eq, ty, ty)(int_eq_impl) +# lower_builtin(operator.ne, ty, ty)(int_ne_impl) + +# lower_builtin(operator.lshift, ty, ty)(int_shl_impl) +# lower_builtin(operator.ilshift, ty, ty)(int_shl_impl) +# lower_builtin(operator.rshift, ty, ty)(int_shr_impl) +# lower_builtin(operator.irshift, ty, ty)(int_shr_impl) + +# lower_builtin(operator.neg, ty)(int_negate_impl) +# lower_builtin(operator.pos, ty)(int_positive_impl) + +# lower_builtin(operator.pow, ty, ty)(int_power_impl) +# lower_builtin(operator.ipow, ty, ty)(int_power_impl) +# lower_builtin(pow, ty, ty)(int_power_impl) + +# for ty in types.unsigned_domain: +# lower_builtin(operator.lt, ty, ty)(int_ult_impl) +# lower_builtin(operator.le, ty, ty)(int_ule_impl) +# lower_builtin(operator.gt, ty, ty)(int_ugt_impl) +# lower_builtin(operator.ge, ty, ty)(int_uge_impl) +# lower_builtin(operator.pow, types.Float, ty)(int_power_impl) +# lower_builtin(operator.ipow, types.Float, ty)(int_power_impl) +# lower_builtin(pow, types.Float, ty)(int_power_impl) +# lower_builtin(abs, ty)(uint_abs_impl) + +# lower_builtin(operator.lt, types.IntegerLiteral, types.IntegerLiteral)(int_slt_impl) +# lower_builtin(operator.gt, types.IntegerLiteral, 
types.IntegerLiteral)(int_slt_impl) +# lower_builtin(operator.le, types.IntegerLiteral, types.IntegerLiteral)(int_slt_impl) +# lower_builtin(operator.ge, types.IntegerLiteral, types.IntegerLiteral)(int_slt_impl) +# for ty in types.signed_domain: +# lower_builtin(operator.lt, ty, ty)(int_slt_impl) +# lower_builtin(operator.le, ty, ty)(int_sle_impl) +# lower_builtin(operator.gt, ty, ty)(int_sgt_impl) +# lower_builtin(operator.ge, ty, ty)(int_sge_impl) +# lower_builtin(operator.pow, types.Float, ty)(int_power_impl) +# lower_builtin(operator.ipow, types.Float, ty)(int_power_impl) +# lower_builtin(pow, types.Float, ty)(int_power_impl) +# lower_builtin(abs, ty)(int_abs_impl) + +# def _implement_bitwise_operators(): +# for ty in (types.Boolean, types.Integer): +# lower_builtin(operator.and_, ty, ty)(int_and_impl) +# lower_builtin(operator.iand, ty, ty)(int_and_impl) +# lower_builtin(operator.or_, ty, ty)(int_or_impl) +# lower_builtin(operator.ior, ty, ty)(int_or_impl) +# lower_builtin(operator.xor, ty, ty)(int_xor_impl) +# lower_builtin(operator.ixor, ty, ty)(int_xor_impl) + +# lower_builtin(operator.invert, ty)(int_invert_impl) + +# _implement_integer_operators() + +# _implement_bitwise_operators() + + +def real_add_impl(context, builder, sig, args): + res = builder.fadd(*args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_sub_impl(context, builder, sig, args): + res = builder.fsub(*args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_mul_impl(context, builder, sig, args): + res = builder.fmul(*args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_div_impl(context, builder, sig, args): + with cgutils.if_zero(builder, args[1]): + context.error_model.fp_zero_division(builder, ("division by zero",)) + res = builder.fdiv(*args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_divmod(context, builder, x, y): + assert x.type == y.type + floatty = x.type + + module = builder.module + fname = context.mangler(".numba.python.rem", [x.type]) + fnty = ir.FunctionType(floatty, (floatty, floatty, ir.PointerType(floatty))) + fn = cgutils.get_or_insert_function(module, fnty, fname) + + if fn.is_declaration: + fn.linkage = "linkonce_odr" + fnbuilder = ir.IRBuilder(fn.append_basic_block("entry")) + fx, fy, pmod = fn.args + div, mod = real_divmod_func_body(context, fnbuilder, fx, fy) + fnbuilder.store(mod, pmod) + fnbuilder.ret(div) + + pmod = cgutils.alloca_once(builder, floatty) + quotient = builder.call(fn, (x, y, pmod)) + return quotient, builder.load(pmod) + + +def real_divmod_func_body(context, builder, vx, wx): + # Reference Objects/floatobject.c + # + # float_divmod(PyObject *v, PyObject *w) + # { + # double vx, wx; + # double div, mod, floordiv; + # CONVERT_TO_DOUBLE(v, vx); + # CONVERT_TO_DOUBLE(w, wx); + # mod = fmod(vx, wx); + # /* fmod is typically exact, so vx-mod is *mathematically* an + # exact multiple of wx. But this is fp arithmetic, and fp + # vx - mod is an approximation; the result is that div may + # not be an exact integral value after the division, although + # it will always be very close to one. 
+ # */ + # div = (vx - mod) / wx; + # if (mod) { + # /* ensure the remainder has the same sign as the denominator */ + # if ((wx < 0) != (mod < 0)) { + # mod += wx; + # div -= 1.0; + # } + # } + # else { + # /* the remainder is zero, and in the presence of signed zeroes + # fmod returns different results across platforms; ensure + # it has the same sign as the denominator; we'd like to do + # "mod = wx * 0.0", but that may get optimized away */ + # mod *= mod; /* hide "mod = +0" from optimizer */ + # if (wx < 0.0) + # mod = -mod; + # } + # /* snap quotient to nearest integral value */ + # if (div) { + # floordiv = floor(div); + # if (div - floordiv > 0.5) + # floordiv += 1.0; + # } + # else { + # /* div is zero - get the same sign as the true quotient */ + # div *= div; /* hide "div = +0" from optimizers */ + # floordiv = div * vx / wx; /* zero w/ sign of vx/wx */ + # } + # return Py_BuildValue("(dd)", floordiv, mod); + # } + pmod = cgutils.alloca_once(builder, vx.type) + pdiv = cgutils.alloca_once(builder, vx.type) + pfloordiv = cgutils.alloca_once(builder, vx.type) + + mod = builder.frem(vx, wx) + div = builder.fdiv(builder.fsub(vx, mod), wx) + + builder.store(mod, pmod) + builder.store(div, pdiv) + + # Note the use of negative zero for proper negating with `ZERO - x` + ZERO = vx.type(0.0) + NZERO = vx.type(-0.0) + ONE = vx.type(1.0) + mod_istrue = builder.fcmp_unordered("!=", mod, ZERO) + wx_ltz = builder.fcmp_ordered("<", wx, ZERO) + mod_ltz = builder.fcmp_ordered("<", mod, ZERO) + + with builder.if_else(mod_istrue, likely=True) as ( + if_nonzero_mod, + if_zero_mod, + ): + with if_nonzero_mod: + # `mod` is non-zero or NaN + # Ensure the remainder has the same sign as the denominator + wx_ltz_ne_mod_ltz = builder.icmp_unsigned("!=", wx_ltz, mod_ltz) + + with builder.if_then(wx_ltz_ne_mod_ltz): + builder.store(builder.fsub(div, ONE), pdiv) + builder.store(builder.fadd(mod, wx), pmod) + + with if_zero_mod: + # `mod` is zero, select the proper sign depending on + # the denominator's sign + mod = builder.select(wx_ltz, NZERO, ZERO) + builder.store(mod, pmod) + + del mod, div + + div = builder.load(pdiv) + div_istrue = builder.fcmp_ordered("!=", div, ZERO) + + with builder.if_then(div_istrue): + realtypemap = {"float": types.float32, "double": types.float64} + realtype = realtypemap[str(wx.type)] + floorfn = context.get_function( + math.floor, typing.signature(realtype, realtype) + ) + floordiv = floorfn(builder, [div]) + floordivdiff = builder.fsub(div, floordiv) + floordivincr = builder.fadd(floordiv, ONE) + HALF = Constant(wx.type, 0.5) + pred = builder.fcmp_ordered(">", floordivdiff, HALF) + floordiv = builder.select(pred, floordivincr, floordiv) + builder.store(floordiv, pfloordiv) + + with cgutils.ifnot(builder, div_istrue): + div = builder.fmul(div, div) + builder.store(div, pdiv) + floordiv = builder.fdiv(builder.fmul(div, vx), wx) + builder.store(floordiv, pfloordiv) + + return builder.load(pfloordiv), builder.load(pmod) + + +# @lower_builtin(divmod, types.Float, types.Float) +def real_divmod_impl(context, builder, sig, args, loc=None): + x, y = args + quot = cgutils.alloca_once(builder, x.type, name="quot") + rem = cgutils.alloca_once(builder, x.type, name="rem") + + with builder.if_else(cgutils.is_scalar_zero(builder, y), likely=False) as ( + if_zero, + if_non_zero, + ): + with if_zero: + if not context.error_model.fp_zero_division( + builder, ("modulo by zero",), loc + ): + # No exception raised => compute the nan result, + # and set the FP exception word for Numpy warnings. 
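+                # Illustrative note (assumption based on the error model
+                # described above): under IEEE-754 semantics fdiv/frem by
+                # zero yield inf/nan rather than trapping, so e.g.
+                # divmod(1.0, 0.0) produces (inf, nan) here, analogous to
+                # NumPy's warn-instead-of-raise behaviour.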
+ q = builder.fdiv(x, y) + r = builder.frem(x, y) + builder.store(q, quot) + builder.store(r, rem) + with if_non_zero: + q, r = real_divmod(context, builder, x, y) + builder.store(q, quot) + builder.store(r, rem) + + return cgutils.pack_array(builder, (builder.load(quot), builder.load(rem))) + + +def real_mod_impl(context, builder, sig, args, loc=None): + x, y = args + res = cgutils.alloca_once(builder, x.type) + with builder.if_else(cgutils.is_scalar_zero(builder, y), likely=False) as ( + if_zero, + if_non_zero, + ): + with if_zero: + if not context.error_model.fp_zero_division( + builder, ("modulo by zero",), loc + ): + # No exception raised => compute the nan result, + # and set the FP exception word for Numpy warnings. + rem = builder.frem(x, y) + builder.store(rem, res) + with if_non_zero: + _, rem = real_divmod(context, builder, x, y) + builder.store(rem, res) + return impl_ret_untracked( + context, builder, sig.return_type, builder.load(res) + ) + + +def real_floordiv_impl(context, builder, sig, args, loc=None): + x, y = args + res = cgutils.alloca_once(builder, x.type) + with builder.if_else(cgutils.is_scalar_zero(builder, y), likely=False) as ( + if_zero, + if_non_zero, + ): + with if_zero: + if not context.error_model.fp_zero_division( + builder, ("division by zero",), loc + ): + # No exception raised => compute the +/-inf or nan result, + # and set the FP exception word for Numpy warnings. + quot = builder.fdiv(x, y) + builder.store(quot, res) + with if_non_zero: + quot, _ = real_divmod(context, builder, x, y) + builder.store(quot, res) + return impl_ret_untracked( + context, builder, sig.return_type, builder.load(res) + ) + + +def real_power_impl(context, builder, sig, args): + x, y = args + module = builder.module + if context.implement_powi_as_math_call: + imp = context.get_function(math.pow, sig) + res = imp(builder, args) + else: + fn = module.declare_intrinsic("llvm.pow", [y.type]) + res = builder.call(fn, (x, y)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_lt_impl(context, builder, sig, args): + res = builder.fcmp_ordered("<", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_le_impl(context, builder, sig, args): + res = builder.fcmp_ordered("<=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_gt_impl(context, builder, sig, args): + res = builder.fcmp_ordered(">", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_ge_impl(context, builder, sig, args): + res = builder.fcmp_ordered(">=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_eq_impl(context, builder, sig, args): + res = builder.fcmp_ordered("==", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_ne_impl(context, builder, sig, args): + res = builder.fcmp_unordered("!=", *args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_abs_impl(context, builder, sig, args): + [ty] = sig.args + sig = typing.signature(ty, ty) + impl = context.get_function(math.fabs, sig) + return impl(builder, args) + + +def real_negate_impl(context, builder, sig, args): + from numba.cpython import mathimpl + + res = mathimpl.negate_real(builder, args[0]) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_positive_impl(context, builder, sig, args): + [typ] = sig.args + [val] = args + res = context.cast(builder, val, typ, sig.return_type) + return 
impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_sign_impl(context, builder, sig, args): + """ + np.sign(float) + """ + [x] = args + POS = Constant(x.type, 1) + NEG = Constant(x.type, -1) + ZERO = Constant(x.type, 0) + + presult = cgutils.alloca_once(builder, x.type) + + is_pos = builder.fcmp_ordered(">", x, ZERO) + is_neg = builder.fcmp_ordered("<", x, ZERO) + + with builder.if_else(is_pos) as (gt_zero, not_gt_zero): + with gt_zero: + builder.store(POS, presult) + with not_gt_zero: + with builder.if_else(is_neg) as (lt_zero, not_lt_zero): + with lt_zero: + builder.store(NEG, presult) + with not_lt_zero: + # For both NaN and 0, the result of sign() is simply + # the input value. + builder.store(x, presult) + + res = builder.load(presult) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# ty = types.Float + +# lower_builtin(operator.add, ty, ty)(real_add_impl) +# lower_builtin(operator.iadd, ty, ty)(real_add_impl) +# lower_builtin(operator.sub, ty, ty)(real_sub_impl) +# lower_builtin(operator.isub, ty, ty)(real_sub_impl) +# lower_builtin(operator.mul, ty, ty)(real_mul_impl) +# lower_builtin(operator.imul, ty, ty)(real_mul_impl) +# lower_builtin(operator.floordiv, ty, ty)(real_floordiv_impl) +# lower_builtin(operator.ifloordiv, ty, ty)(real_floordiv_impl) +# lower_builtin(operator.truediv, ty, ty)(real_div_impl) +# lower_builtin(operator.itruediv, ty, ty)(real_div_impl) +# lower_builtin(operator.mod, ty, ty)(real_mod_impl) +# lower_builtin(operator.imod, ty, ty)(real_mod_impl) +# lower_builtin(operator.pow, ty, ty)(real_power_impl) +# lower_builtin(operator.ipow, ty, ty)(real_power_impl) +# lower_builtin(pow, ty, ty)(real_power_impl) + +# lower_builtin(operator.eq, ty, ty)(real_eq_impl) +# lower_builtin(operator.ne, ty, ty)(real_ne_impl) +# lower_builtin(operator.lt, ty, ty)(real_lt_impl) +# lower_builtin(operator.le, ty, ty)(real_le_impl) +# lower_builtin(operator.gt, ty, ty)(real_gt_impl) +# lower_builtin(operator.ge, ty, ty)(real_ge_impl) + +# lower_builtin(abs, ty)(real_abs_impl) + +# lower_builtin(operator.neg, ty)(real_negate_impl) +# lower_builtin(operator.pos, ty)(real_positive_impl) + +# del ty + + +# @lower_getattr(types.Complex, "real") +def complex_real_impl(context, builder, typ, value): + cplx = context.make_complex(builder, typ, value=value) + res = cplx.real + return impl_ret_untracked(context, builder, typ, res) + + +# @lower_getattr(types.Complex, "imag") +def complex_imag_impl(context, builder, typ, value): + cplx = context.make_complex(builder, typ, value=value) + res = cplx.imag + return impl_ret_untracked(context, builder, typ, res) + + +# @lower_builtin("complex.conjugate", types.Complex) +def complex_conjugate_impl(context, builder, sig, args): + from numba.cpython import mathimpl + + z = context.make_complex(builder, sig.args[0], args[0]) + z.imag = mathimpl.negate_real(builder, z.imag) + res = z._getvalue() + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def real_real_impl(context, builder, typ, value): + return impl_ret_untracked(context, builder, typ, value) + + +def real_imag_impl(context, builder, typ, value): + res = cgutils.get_null_value(value.type) + return impl_ret_untracked(context, builder, typ, res) + + +def real_conjugate_impl(context, builder, sig, args): + return impl_ret_untracked(context, builder, sig.return_type, args[0]) + + +# for cls in (types.Float, types.Integer): +# lower_getattr(cls, "real")(real_real_impl) +# lower_getattr(cls, "imag")(real_imag_impl) +# 
lower_builtin("complex.conjugate", cls)(real_conjugate_impl) + + +# @lower_builtin(operator.pow, types.Complex, types.Complex) +# @lower_builtin(operator.ipow, types.Complex, types.Complex) +# @lower_builtin(pow, types.Complex, types.Complex) +def complex_power_impl(context, builder, sig, args): + [ca, cb] = args + ty = sig.args[0] + fty = ty.underlying_float + a = context.make_helper(builder, ty, value=ca) + b = context.make_helper(builder, ty, value=cb) + c = context.make_helper(builder, ty) + module = builder.module + pa = a._getpointer() + pb = b._getpointer() + pc = c._getpointer() + + # Optimize for square because cpow loses a lot of precision + TWO = context.get_constant(fty, 2) + ZERO = context.get_constant(fty, 0) + + b_real_is_two = builder.fcmp_ordered("==", b.real, TWO) + b_imag_is_zero = builder.fcmp_ordered("==", b.imag, ZERO) + b_is_two = builder.and_(b_real_is_two, b_imag_is_zero) + + with builder.if_else(b_is_two) as (then, otherwise): + with then: + # Lower as multiplication + res = complex_mul_impl(context, builder, sig, (ca, ca)) + cres = context.make_helper(builder, ty, value=res) + c.real = cres.real + c.imag = cres.imag + + with otherwise: + # Lower with call to external function + func_name = { + types.complex64: "numba_cpowf", + types.complex128: "numba_cpow", + }[ty] + fnty = ir.FunctionType(ir.VoidType(), [pa.type] * 3) + cpow = cgutils.get_or_insert_function(module, fnty, func_name) + builder.call(cpow, (pa, pb, pc)) + + res = builder.load(pc) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def complex_add_impl(context, builder, sig, args): + [cx, cy] = args + ty = sig.args[0] + x = context.make_complex(builder, ty, value=cx) + y = context.make_complex(builder, ty, value=cy) + z = context.make_complex(builder, ty) + a = x.real + b = x.imag + c = y.real + d = y.imag + z.real = builder.fadd(a, c) + z.imag = builder.fadd(b, d) + res = z._getvalue() + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def complex_sub_impl(context, builder, sig, args): + [cx, cy] = args + ty = sig.args[0] + x = context.make_complex(builder, ty, value=cx) + y = context.make_complex(builder, ty, value=cy) + z = context.make_complex(builder, ty) + a = x.real + b = x.imag + c = y.real + d = y.imag + z.real = builder.fsub(a, c) + z.imag = builder.fsub(b, d) + res = z._getvalue() + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def complex_mul_impl(context, builder, sig, args): + """ + (a+bi)(c+di)=(ac-bd)+i(ad+bc) + """ + [cx, cy] = args + ty = sig.args[0] + x = context.make_complex(builder, ty, value=cx) + y = context.make_complex(builder, ty, value=cy) + z = context.make_complex(builder, ty) + a = x.real + b = x.imag + c = y.real + d = y.imag + ac = builder.fmul(a, c) + bd = builder.fmul(b, d) + ad = builder.fmul(a, d) + bc = builder.fmul(b, c) + z.real = builder.fsub(ac, bd) + z.imag = builder.fadd(ad, bc) + res = z._getvalue() + return impl_ret_untracked(context, builder, sig.return_type, res) + + +NAN = float("nan") + + +def complex_div_impl(context, builder, sig, args): + def complex_div(a, b): + # This is CPython's algorithm (in _Py_c_quot()). 
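+        # Worked example (illustrative): (1+2j) / (3+4j).  Since |3| < |4|,
+        # the second branch scales by ratio = 3/4: denom = 3*0.75 + 4 = 6.25,
+        # real = (1*0.75 + 2) / 6.25 = 0.44, imag = (2*0.75 - 1) / 6.25 = 0.08,
+        # i.e. 0.44+0.08j, the exact quotient.  Scaling by the ratio keeps the
+        # intermediate products from overflowing for large operands.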
+ areal = a.real + aimag = a.imag + breal = b.real + bimag = b.imag + if not breal and not bimag: + raise ZeroDivisionError("complex division by zero") + if abs(breal) >= abs(bimag): + # Divide tops and bottom by b.real + if not breal: + return complex(NAN, NAN) + ratio = bimag / breal + denom = breal + bimag * ratio + return complex( + (areal + aimag * ratio) / denom, (aimag - areal * ratio) / denom + ) + else: + # Divide tops and bottom by b.imag + if not bimag: + return complex(NAN, NAN) + ratio = breal / bimag + denom = breal * ratio + bimag + return complex( + (a.real * ratio + a.imag) / denom, + (a.imag * ratio - a.real) / denom, + ) + + res = context.compile_internal(builder, complex_div, sig, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def complex_negate_impl(context, builder, sig, args): + from numba.cpython import mathimpl + + [typ] = sig.args + [val] = args + cmplx = context.make_complex(builder, typ, value=val) + res = context.make_complex(builder, typ) + res.real = mathimpl.negate_real(builder, cmplx.real) + res.imag = mathimpl.negate_real(builder, cmplx.imag) + res = res._getvalue() + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def complex_positive_impl(context, builder, sig, args): + [val] = args + return impl_ret_untracked(context, builder, sig.return_type, val) + + +def complex_eq_impl(context, builder, sig, args): + [cx, cy] = args + typ = sig.args[0] + x = context.make_complex(builder, typ, value=cx) + y = context.make_complex(builder, typ, value=cy) + + reals_are_eq = builder.fcmp_ordered("==", x.real, y.real) + imags_are_eq = builder.fcmp_ordered("==", x.imag, y.imag) + res = builder.and_(reals_are_eq, imags_are_eq) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def complex_ne_impl(context, builder, sig, args): + [cx, cy] = args + typ = sig.args[0] + x = context.make_complex(builder, typ, value=cx) + y = context.make_complex(builder, typ, value=cy) + + reals_are_ne = builder.fcmp_unordered("!=", x.real, y.real) + imags_are_ne = builder.fcmp_unordered("!=", x.imag, y.imag) + res = builder.or_(reals_are_ne, imags_are_ne) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +def complex_abs_impl(context, builder, sig, args): + """ + abs(z) := hypot(z.real, z.imag) + """ + + def complex_abs(z): + return math.hypot(z.real, z.imag) + + res = context.compile_internal(builder, complex_abs, sig, args) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# ty = types.Complex + +# lower_builtin(operator.add, ty, ty)(complex_add_impl) +# lower_builtin(operator.iadd, ty, ty)(complex_add_impl) +# lower_builtin(operator.sub, ty, ty)(complex_sub_impl) +# lower_builtin(operator.isub, ty, ty)(complex_sub_impl) +# lower_builtin(operator.mul, ty, ty)(complex_mul_impl) +# lower_builtin(operator.imul, ty, ty)(complex_mul_impl) +# lower_builtin(operator.truediv, ty, ty)(complex_div_impl) +# lower_builtin(operator.itruediv, ty, ty)(complex_div_impl) +# lower_builtin(operator.neg, ty)(complex_negate_impl) +# lower_builtin(operator.pos, ty)(complex_positive_impl) +# # Complex modulo is deprecated in python3 + +# lower_builtin(operator.eq, ty, ty)(complex_eq_impl) +# lower_builtin(operator.ne, ty, ty)(complex_ne_impl) + +# lower_builtin(abs, ty)(complex_abs_impl) + +# del ty + + +# @lower_builtin("number.item", types.Boolean) +# @lower_builtin("number.item", types.Number) +def number_item_impl(context, builder, sig, args): + """ + The no-op .item() method on booleans and 
numbers. + """ + return args[0] + + +# ------------------------------------------------------------------------------ + + +def number_not_impl(context, builder, sig, args): + [typ] = sig.args + [val] = args + istrue = context.cast(builder, val, typ, sig.return_type) + res = builder.not_(istrue) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# @lower_builtin(bool, types.Boolean) +def bool_as_bool(context, builder, sig, args): + [val] = args + return val + + +# @lower_builtin(bool, types.Integer) +def int_as_bool(context, builder, sig, args): + [val] = args + return builder.icmp_unsigned("!=", val, Constant(val.type, 0)) + + +# @lower_builtin(bool, types.Float) +def float_as_bool(context, builder, sig, args): + [val] = args + return builder.fcmp_unordered("!=", val, Constant(val.type, 0.0)) + + +# @lower_builtin(bool, types.Complex) +def complex_as_bool(context, builder, sig, args): + [typ] = sig.args + [val] = args + cmplx = context.make_complex(builder, typ, val) + real, imag = cmplx.real, cmplx.imag + zero = Constant(real.type, 0.0) + real_istrue = builder.fcmp_unordered("!=", real, zero) + imag_istrue = builder.fcmp_unordered("!=", imag, zero) + return builder.or_(real_istrue, imag_istrue) + + +# for ty in (types.Integer, types.Float, types.Complex): +# lower_builtin(operator.not_, ty)(number_not_impl) + +# lower_builtin(operator.not_, types.boolean)(number_not_impl) + + +# ------------------------------------------------------------------------------ +# Hashing numbers, see hashing.py + +# ------------------------------------------------------------------------------- +# Implicit casts between numerics + + +# @lower_cast(types.IntegerLiteral, types.Integer) +# @lower_cast(types.IntegerLiteral, types.Float) +# @lower_cast(types.IntegerLiteral, types.Complex) +def literal_int_to_number(context, builder, fromty, toty, val): + lit = context.get_constant_generic( + builder, + fromty.literal_type, + fromty.literal_value, + ) + return context.cast(builder, lit, fromty.literal_type, toty) + + +# @lower_cast(types.Integer, types.Integer) +def integer_to_integer(context, builder, fromty, toty, val): + if toty.bitwidth == fromty.bitwidth: + # Just a change of signedness + return val + elif toty.bitwidth < fromty.bitwidth: + # Downcast + return builder.trunc(val, context.get_value_type(toty)) + elif fromty.signed: + # Signed upcast + return builder.sext(val, context.get_value_type(toty)) + else: + # Unsigned upcast + return builder.zext(val, context.get_value_type(toty)) + + +# @lower_cast(types.Integer, types.voidptr) +def integer_to_voidptr(context, builder, fromty, toty, val): + return builder.inttoptr(val, context.get_value_type(toty)) + + +# @lower_cast(types.Float, types.Float) +def float_to_float(context, builder, fromty, toty, val): + lty = context.get_value_type(toty) + if fromty.bitwidth < toty.bitwidth: + return builder.fpext(val, lty) + else: + return builder.fptrunc(val, lty) + + +# @lower_cast(types.Integer, types.Float) +def integer_to_float(context, builder, fromty, toty, val): + lty = context.get_value_type(toty) + if fromty.signed: + return builder.sitofp(val, lty) + else: + return builder.uitofp(val, lty) + + +# @lower_cast(types.Float, types.Integer) +def float_to_integer(context, builder, fromty, toty, val): + lty = context.get_value_type(toty) + if toty.signed: + return builder.fptosi(val, lty) + else: + return builder.fptoui(val, lty) + + +# @lower_cast(types.Float, types.Complex) +# @lower_cast(types.Integer, types.Complex) +def 
non_complex_to_complex(context, builder, fromty, toty, val): + real = context.cast(builder, val, fromty, toty.underlying_float) + imag = context.get_constant(toty.underlying_float, 0) + + cmplx = context.make_complex(builder, toty) + cmplx.real = real + cmplx.imag = imag + return cmplx._getvalue() + + +# @lower_cast(types.Complex, types.Complex) +def complex_to_complex(context, builder, fromty, toty, val): + srcty = fromty.underlying_float + dstty = toty.underlying_float + + src = context.make_complex(builder, fromty, value=val) + dst = context.make_complex(builder, toty) + dst.real = context.cast(builder, src.real, srcty, dstty) + dst.imag = context.cast(builder, src.imag, srcty, dstty) + return dst._getvalue() + + +# @lower_cast(types.Any, types.Boolean) +def any_to_boolean(context, builder, fromty, toty, val): + return context.is_true(builder, fromty, val) + + +# @lower_cast(types.Boolean, types.Number) +def boolean_to_any(context, builder, fromty, toty, val): + # Casting from boolean to anything first casts to int32 + asint = builder.zext(val, ir.IntType(32)) + return context.cast(builder, asint, types.int32, toty) + + +# @lower_cast(types.IntegerLiteral, types.Boolean) +# @lower_cast(types.BooleanLiteral, types.Boolean) +def literal_int_to_boolean(context, builder, fromty, toty, val): + lit = context.get_constant_generic( + builder, + fromty.literal_type, + fromty.literal_value, + ) + return context.is_true(builder, fromty.literal_type, lit) + + +# ------------------------------------------------------------------------------- +# Constants + + +# @lower_constant(types.Complex) +def constant_complex(context, builder, ty, pyval): + fty = ty.underlying_float + real = context.get_constant_generic(builder, fty, pyval.real) + imag = context.get_constant_generic(builder, fty, pyval.imag) + return Constant.literal_struct((real, imag)) + + +# @lower_constant(types.Integer) +# @lower_constant(types.Float) +# @lower_constant(types.Boolean) +def constant_integer(context, builder, ty, pyval): + # See https://github.com/numba/numba/issues/6979 + # llvmlite ir.IntType specialises the formatting of the constant for a + # cpython bool. A NumPy np.bool_ is not a cpython bool so force it to be one + # so that the constant renders correctly! 
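+ # For instance, a types.boolean constant built from np.bool_(True) should
+ # lower to an i1 constant holding 1 once the value is coerced to a plain bool.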
+ if isinstance(pyval, np.bool_): + pyval = bool(pyval) + lty = context.get_value_type(ty) + return lty(pyval) + + +# ------------------------------------------------------------------------------- +# View + + +def scalar_view(scalar, viewty): + """Typing for the np scalar 'view' method.""" + if isinstance(scalar, (types.Float, types.Integer)) and isinstance( + viewty, types.abstract.DTypeSpec + ): + if scalar.bitwidth != viewty.dtype.bitwidth: + raise errors.TypingError( + "Changing the dtype of a 0d array is only supported if the " + "itemsize is unchanged" + ) + + def impl(scalar, viewty): + return viewer(scalar, viewty) + + return impl + + +# overload_method(types.Float, 'view')(scalar_view) +# overload_method(types.Integer, 'view')(scalar_view) diff --git a/numba_cuda/numba/cuda/np/npyfuncs.py b/numba_cuda/numba/cuda/np/npyfuncs.py index 84eec9ad7..5dc2619ee 100644 --- a/numba_cuda/numba/cuda/np/npyfuncs.py +++ b/numba_cuda/numba/cuda/np/npyfuncs.py @@ -18,8 +18,8 @@ from numba.cuda import cgutils from numba.cuda.extending import register_jitable from numba.np import npdatetime -from numba.np.math import cmathimpl, mathimpl, numbers -from numba.np.numpy_support import numpy_version +from numba.cuda.np.math import cmathimpl, mathimpl, numbers +from numba.cuda.np.numpy_support import numpy_version # some NumPy constants. Note that we could generate some of them using # the math library, but having the values copied from npy_math seems to diff --git a/numba_cuda/numba/cuda/np/npyimpl.py b/numba_cuda/numba/cuda/np/npyimpl.py new file mode 100644 index 000000000..62e99a7e5 --- /dev/null +++ b/numba_cuda/numba/cuda/np/npyimpl.py @@ -0,0 +1,1027 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +""" +Implementation of functions in the Numpy package. +""" + +import itertools +from collections import namedtuple + +import llvmlite.ir as ir + +import numpy as np +import operator + +from numba.cuda.np import arrayobj +from numba.cuda.np import ufunc_db +from numba.cuda.np.ufunc.sigparse import parse_signature +from numba.core.imputils import ( + Registry, + impl_ret_new_ref, + force_error_model, + impl_ret_borrowed, +) +from numba.core import typing, types +from numba.cuda import cgutils +from numba.cuda.np.numpy_support import ( + select_array_wrapper, + from_dtype, + _ufunc_loop_sig, +) +from numba.np.numpy_support import ufunc_find_matching_loop +from numba.cuda.np.arrayobj import _getitem_array_generic +from numba.core.typing import npydecl +from numba.core.extending import overload, intrinsic + +from numba.core import errors + +registry = Registry("npyimpl") + + +######################################################################## + +# In the way we generate code, ufuncs work with scalar as well as +# with array arguments. The following helper classes help dealing +# with scalar and array arguments in a regular way. +# +# In short, the classes provide a uniform interface. The interface +# handles the indexing of as many dimensions as the array may have. +# For scalars, all indexing is ignored and when the value is read, +# the scalar is returned. For arrays code for actual indexing is +# generated and reading performs the appropriate indirection. + + +class _ScalarIndexingHelper(object): + def update_indices(self, loop_indices, name): + pass + + def as_values(self): + pass + + +class _ScalarHelper(object): + """Helper class to handle scalar arguments (and result). 
+ Note that store_data is only used when generating code for + a scalar ufunc and to write the output value. + + For loading, the value is directly used without having any + kind of indexing nor memory backing it up. This is the use + for input arguments. + + For storing, a variable is created in the stack where the + value will be written. + + Note that it is not supported (as it is unneeded for our + current use-cases) reading back a stored value. This class + will always "load" the original value it got at its creation. + """ + + def __init__(self, ctxt, bld, val, ty): + self.context = ctxt + self.builder = bld + self.val = val + self.base_type = ty + intpty = ctxt.get_value_type(types.intp) + self.shape = [ir.Constant(intpty, 1)] + + lty = ctxt.get_data_type(ty) if ty != types.boolean else ir.IntType(1) + self._ptr = cgutils.alloca_once(bld, lty) + + def create_iter_indices(self): + return _ScalarIndexingHelper() + + def load_data(self, indices): + return self.val + + def store_data(self, indices, val): + self.builder.store(val, self._ptr) + + @property + def return_val(self): + return self.builder.load(self._ptr) + + +class _ArrayIndexingHelper( + namedtuple("_ArrayIndexingHelper", ("array", "indices")) +): + def update_indices(self, loop_indices, name): + bld = self.array.builder + intpty = self.array.context.get_value_type(types.intp) + ONE = ir.Constant(ir.IntType(intpty.width), 1) + + # we are only interested in as many inner dimensions as dimensions + # the indexed array has (the outer dimensions are broadcast, so + # ignoring the outer indices produces the desired result. + indices = loop_indices[len(loop_indices) - len(self.indices) :] + for src, dst, dim in zip(indices, self.indices, self.array.shape): + cond = bld.icmp_unsigned(">", dim, ONE) + with bld.if_then(cond): + bld.store(src, dst) + + def as_values(self): + """ + The indexing helper is built using alloca for each value, so it + actually contains pointers to the actual indices to load. Note + that update_indices assumes the same. This method returns the + indices as values + """ + bld = self.array.builder + return [bld.load(index) for index in self.indices] + + +class _ArrayHelper( + namedtuple( + "_ArrayHelper", + ( + "context", + "builder", + "shape", + "strides", + "data", + "layout", + "base_type", + "ndim", + "return_val", + ), + ) +): + """Helper class to handle array arguments/result. + It provides methods to generate code loading/storing specific + items as well as support code for handling indices. 
+ """ + + def create_iter_indices(self): + intpty = self.context.get_value_type(types.intp) + ZERO = ir.Constant(ir.IntType(intpty.width), 0) + + indices = [] + for i in range(self.ndim): + x = cgutils.alloca_once(self.builder, ir.IntType(intpty.width)) + self.builder.store(ZERO, x) + indices.append(x) + return _ArrayIndexingHelper(self, indices) + + def _load_effective_address(self, indices): + return cgutils.get_item_pointer2( + self.context, + self.builder, + data=self.data, + shape=self.shape, + strides=self.strides, + layout=self.layout, + inds=indices, + ) + + def load_data(self, indices): + model = self.context.data_model_manager[self.base_type] + ptr = self._load_effective_address(indices) + return model.load_from_data_pointer(self.builder, ptr) + + def store_data(self, indices, value): + ctx = self.context + bld = self.builder + store_value = ctx.get_value_as_data(bld, self.base_type, value) + assert ctx.get_data_type(self.base_type) == store_value.type + bld.store(store_value, self._load_effective_address(indices)) + + +class _ArrayGUHelper( + namedtuple( + "_ArrayHelper", + ( + "context", + "builder", + "shape", + "strides", + "data", + "layout", + "base_type", + "ndim", + "inner_arr_ty", + "is_input_arg", + ), + ) +): + """Helper class to handle array arguments/result. + It provides methods to generate code loading/storing specific + items as well as support code for handling indices. + + Contrary to _ArrayHelper, this class can create a view to a subarray + """ + + def create_iter_indices(self): + intpty = self.context.get_value_type(types.intp) + ZERO = ir.Constant(ir.IntType(intpty.width), 0) + + indices = [] + for i in range(self.ndim - self.inner_arr_ty.ndim): + x = cgutils.alloca_once(self.builder, ir.IntType(intpty.width)) + self.builder.store(ZERO, x) + indices.append(x) + return _ArrayIndexingHelper(self, indices) + + def _load_effective_address(self, indices): + context = self.context + builder = self.builder + arr_ty = types.Array(self.base_type, self.ndim, self.layout) + arr = context.make_array(arr_ty)(context, builder, self.data) + + return cgutils.get_item_pointer2( + context, + builder, + data=arr.data, + shape=self.shape, + strides=self.strides, + layout=self.layout, + inds=indices, + ) + + def load_data(self, indices): + context, builder = self.context, self.builder + + if self.inner_arr_ty.ndim == 0 and self.is_input_arg: + # scalar case for input arguments + model = context.data_model_manager[self.base_type] + ptr = self._load_effective_address(indices) + return model.load_from_data_pointer(builder, ptr) + elif self.inner_arr_ty.ndim == 0 and not self.is_input_arg: + # Output arrays are handled as 1d with shape=(1,) when its + # signature represents a scalar. 
For instance: "(n),(m) -> ()" + intpty = context.get_value_type(types.intp) + one = intpty(1) + + fromty = types.Array(self.base_type, self.ndim, self.layout) + toty = types.Array(self.base_type, 1, self.layout) + itemsize = intpty(arrayobj.get_itemsize(context, fromty)) + + # create a view from the original ndarray to a 1d array + arr_from = self.context.make_array(fromty)( + context, builder, self.data + ) + arr_to = self.context.make_array(toty)(context, builder) + arrayobj.populate_array( + arr_to, + data=self._load_effective_address(indices), + shape=cgutils.pack_array(builder, [one]), + strides=cgutils.pack_array(builder, [itemsize]), + itemsize=arr_from.itemsize, + meminfo=arr_from.meminfo, + parent=arr_from.parent, + ) + return arr_to._getvalue() + else: + # generic case + # getitem n-dim array -> m-dim array, where N > M + index_types = (types.int64,) * (self.ndim - self.inner_arr_ty.ndim) + arrty = types.Array(self.base_type, self.ndim, self.layout) + arr = self.context.make_array(arrty)(context, builder, self.data) + res = _getitem_array_generic( + context, + builder, + self.inner_arr_ty, + arrty, + arr, + index_types, + indices, + ) + return impl_ret_borrowed(context, builder, self.inner_arr_ty, res) + + def guard_shape(self, loopshape): + inner_ndim = self.inner_arr_ty.ndim + + def raise_impl(loop_shape, array_shape): + # This would in fact be a test for broadcasting. + # Broadcast would fail if, ignoring the core dimensions, the + # remaining ones are different than indices given by loop shape. + + remaining = len(array_shape) - inner_ndim + _raise = remaining > len(loop_shape) + if not _raise: + for i in range(remaining): + _raise |= array_shape[i] != loop_shape[i] + if _raise: + # Ideally we should call `np.broadcast_shapes` with loop and + # array shapes. But since broadcasting is not supported here, + # we just raise an error + # TODO: check why raising a dynamic exception here fails + raise ValueError("Loop and array shapes are incompatible") + + context, builder = self.context, self.builder + sig = types.none( + types.UniTuple(types.intp, len(loopshape)), + types.UniTuple(types.intp, len(self.shape)), + ) + tup = ( + context.make_tuple(builder, sig.args[0], loopshape), + context.make_tuple(builder, sig.args[1], self.shape), + ) + context.compile_internal(builder, raise_impl, sig, tup) + + def guard_match_core_dims(self, other: "_ArrayGUHelper", ndims: int): + # arguments with the same signature should match their core dimensions + # + # @guvectorize('(n,m), (n,m) -> (n)') + # def foo(x, y, res): + # ... + # + # x and y should have the same core (2D) dimensions + def raise_impl(self_shape, other_shape): + same = True + a, b = len(self_shape) - ndims, len(other_shape) - ndims + for i in range(ndims): + same &= self_shape[a + i] == other_shape[b + i] + if not same: + # NumPy raises the following: + # ValueError: gufunc: Input operand 1 has a mismatch in its + # core dimension 0, with gufunc signature (n),(n) -> () + # (size 3 is different from 2) + # But since we cannot raise a dynamic exception here, we just + # (try) something meaninful + msg = ( + "Operand has a mismatch in one of its core dimensions. " + "Please, check if all arguments to a @guvectorize " + "function have the same core dimensions." 
+ ) + raise ValueError(msg) + + context, builder = self.context, self.builder + sig = types.none( + types.UniTuple(types.intp, len(self.shape)), + types.UniTuple(types.intp, len(other.shape)), + ) + tup = ( + context.make_tuple(builder, sig.args[0], self.shape), + context.make_tuple(builder, sig.args[1], other.shape), + ) + context.compile_internal(builder, raise_impl, sig, tup) + + +def _prepare_argument(ctxt, bld, inp, tyinp, where="input operand"): + """returns an instance of the appropriate Helper (either + _ScalarHelper or _ArrayHelper) class to handle the argument. + using the polymorphic interface of the Helper classes, scalar + and array cases can be handled with the same code""" + + # first un-Optional Optionals + if isinstance(tyinp, types.Optional): + oty = tyinp + tyinp = tyinp.type + inp = ctxt.cast(bld, inp, oty, tyinp) + + # then prepare the arg for a concrete instance + if isinstance(tyinp, types.ArrayCompatible): + ary = ctxt.make_array(tyinp)(ctxt, bld, inp) + shape = cgutils.unpack_tuple(bld, ary.shape, tyinp.ndim) + strides = cgutils.unpack_tuple(bld, ary.strides, tyinp.ndim) + return _ArrayHelper( + ctxt, + bld, + shape, + strides, + ary.data, + tyinp.layout, + tyinp.dtype, + tyinp.ndim, + inp, + ) + elif types.unliteral(tyinp) in types.number_domain | { + types.boolean + } or isinstance(tyinp, types.scalars._NPDatetimeBase): + return _ScalarHelper(ctxt, bld, inp, tyinp) + else: + raise NotImplementedError( + "unsupported type for {0}: {1}".format(where, str(tyinp)) + ) + + +_broadcast_onto_sig = types.intp( + types.intp, + types.CPointer(types.intp), + types.intp, + types.CPointer(types.intp), +) + + +def _broadcast_onto(src_ndim, src_shape, dest_ndim, dest_shape): + """Low-level utility function used in calculating a shape for + an implicit output array. This function assumes that the + destination shape is an LLVM pointer to a C-style array that was + already initialized to a size of one along all axes. + + Returns an integer value: + >= 1 : Succeeded. Return value should equal the number of dimensions in + the destination shape. + 0 : Failed to broadcast because source shape is larger than the + destination shape (this case should be weeded out at type + checking). + < 0 : Failed to broadcast onto destination axis, at axis number == + -(return_value + 1). + """ + if src_ndim > dest_ndim: + # This check should have been done during type checking, but + # let's be defensive anyway... + return 0 + else: + src_index = 0 + dest_index = dest_ndim - src_ndim + while src_index < src_ndim: + src_dim_size = src_shape[src_index] + dest_dim_size = dest_shape[dest_index] + # Check to see if we've already mutated the destination + # shape along this axis. + if dest_dim_size != 1: + # If we have mutated the destination shape already, + # then the source axis size must either be one, + # or the destination axis size. + if src_dim_size != dest_dim_size and src_dim_size != 1: + return -(dest_index + 1) + elif src_dim_size != 1: + # If the destination size is still its initial + dest_shape[dest_index] = src_dim_size + src_index += 1 + dest_index += 1 + return dest_index + + +def _build_array(context, builder, array_ty, input_types, inputs): + """Utility function to handle allocation of an implicit output array + given the target context, builder, output array type, and a list of + _ArrayHelper instances. 
+ """ + # First, strip optional types, ufunc loops are typed on concrete types + input_types = [ + x.type if isinstance(x, types.Optional) else x for x in input_types + ] + + intp_ty = context.get_value_type(types.intp) + + def make_intp_const(val): + return context.get_constant(types.intp, val) + + ZERO = make_intp_const(0) # noqa: F841 + ONE = make_intp_const(1) + + src_shape = cgutils.alloca_once( + builder, intp_ty, array_ty.ndim, "src_shape" + ) + dest_ndim = make_intp_const(array_ty.ndim) + dest_shape = cgutils.alloca_once( + builder, intp_ty, array_ty.ndim, "dest_shape" + ) + dest_shape_addrs = tuple( + cgutils.gep_inbounds(builder, dest_shape, index) + for index in range(array_ty.ndim) + ) + + # Initialize the destination shape with all ones. + for dest_shape_addr in dest_shape_addrs: + builder.store(ONE, dest_shape_addr) + + # For each argument, try to broadcast onto the destination shape, + # mutating along any axis where the argument shape is not one and + # the destination shape is one. + for arg_number, arg in enumerate(inputs): + if not hasattr(arg, "ndim"): # Skip scalar arguments + continue + arg_ndim = make_intp_const(arg.ndim) + for index in range(arg.ndim): + builder.store( + arg.shape[index], + cgutils.gep_inbounds(builder, src_shape, index), + ) + arg_result = context.compile_internal( + builder, + _broadcast_onto, + _broadcast_onto_sig, + [arg_ndim, src_shape, dest_ndim, dest_shape], + ) + with cgutils.if_unlikely( + builder, builder.icmp_signed("<", arg_result, ONE) + ): + msg = "unable to broadcast argument %d to output array" % ( + arg_number, + ) + + loc = errors.loc_info.get("loc", None) + if loc is not None: + msg += '\nFile "%s", line %d, ' % (loc.filename, loc.line) + + context.call_conv.return_user_exc(builder, ValueError, (msg,)) + + real_array_ty = array_ty.as_array + + dest_shape_tup = tuple( + builder.load(dest_shape_addr) for dest_shape_addr in dest_shape_addrs + ) + array_val = arrayobj._empty_nd_impl( + context, builder, real_array_ty, dest_shape_tup + ) + + # Get the best argument to call __array_wrap__ on + array_wrapper_index = select_array_wrapper(input_types) + array_wrapper_ty = input_types[array_wrapper_index] + try: + # __array_wrap__(source wrapped array, out array) -> out wrapped array + array_wrap = context.get_function( + "__array_wrap__", array_ty(array_wrapper_ty, real_array_ty) + ) + except NotImplementedError: + # If it's the same priority as a regular array, assume we + # should use the allocated array unchanged. 
+ if array_wrapper_ty.array_priority != types.Array.array_priority:
+ raise
+ out_val = array_val._getvalue()
+ else:
+ wrap_args = (
+ inputs[array_wrapper_index].return_val,
+ array_val._getvalue(),
+ )
+ out_val = array_wrap(builder, wrap_args)
+
+ ndim = array_ty.ndim
+ shape = cgutils.unpack_tuple(builder, array_val.shape, ndim)
+ strides = cgutils.unpack_tuple(builder, array_val.strides, ndim)
+ return _ArrayHelper(
+ context,
+ builder,
+ shape,
+ strides,
+ array_val.data,
+ array_ty.layout,
+ array_ty.dtype,
+ ndim,
+ out_val,
+ )
+
+
+# ufuncs either return a single result when nout == 1, else a tuple of results
+
+
+def _unpack_output_types(ufunc, sig):
+ if ufunc.nout == 1:
+ return [sig.return_type]
+ else:
+ return list(sig.return_type)
+
+
+def _unpack_output_values(ufunc, builder, values):
+ if ufunc.nout == 1:
+ return [values]
+ else:
+ return cgutils.unpack_tuple(builder, values)
+
+
+def _pack_output_values(ufunc, context, builder, typ, values):
+ if ufunc.nout == 1:
+ return values[0]
+ else:
+ return context.make_tuple(builder, typ, values)
+
+
+def numpy_ufunc_kernel(context, builder, sig, args, ufunc, kernel_class):
+ # This is the code generator that builds all the looping needed
+ # to execute a NumPy function over several dimensions (including
+ # scalar cases).
+ #
+ # context - the code generation context
+ # builder - the code emitter
+ # sig - signature of the ufunc
+ # args - the args to the ufunc
+ # ufunc - the ufunc itself
+ # kernel_class - a code generating subclass of _Kernel that provides
+ # the per-element operation through its generate() method
+
+ arguments = [
+ _prepare_argument(context, builder, arg, tyarg)
+ for arg, tyarg in zip(args, sig.args)
+ ]
+
+ if len(arguments) < ufunc.nin:
+ raise RuntimeError(
+ "Not enough inputs to {}, expected {} got {}".format(
+ ufunc.__name__, ufunc.nin, len(arguments)
+ )
+ )
+
+ for out_i, ret_ty in enumerate(_unpack_output_types(ufunc, sig)):
+ if ufunc.nin + out_i >= len(arguments):
+ # this out argument is not provided
+ if isinstance(ret_ty, types.ArrayCompatible):
+ output = _build_array(
+ context, builder, ret_ty, sig.args, arguments
+ )
+ else:
+ output = _prepare_argument(
+ context,
+ builder,
+ ir.Constant(context.get_value_type(ret_ty), None),
+ ret_ty,
+ )
+ arguments.append(output)
+ elif context.enable_nrt:
+ # Incref the output
+ context.nrt.incref(builder, ret_ty, args[ufunc.nin + out_i])
+
+ inputs = arguments[: ufunc.nin]
+ outputs = arguments[ufunc.nin :]
+ assert len(outputs) == ufunc.nout
+
+ outer_sig = _ufunc_loop_sig(
+ [a.base_type for a in outputs], [a.base_type for a in inputs]
+ )
+ kernel = kernel_class(context, builder, outer_sig)
+ intpty = context.get_value_type(types.intp)
+
+ indices = [inp.create_iter_indices() for inp in inputs]
+
+ # assume outputs are all the same size, which numpy requires
+
+ loopshape = outputs[0].shape
+
+ # count the number of C and F layout arrays, respectively
+ input_layouts = [
+ inp.layout for inp in inputs if isinstance(inp, _ArrayHelper)
+ ]
+ num_c_layout = len([x for x in input_layouts if x == "C"])
+ num_f_layout = len([x for x in input_layouts if x == "F"])
+
+ # Only choose F iteration order if more arrays are in F layout.
+ # Default to C order otherwise.
+ # This is a best effort for performance. NumPy has more fancy logic that
+ # uses array iterators in non-trivial cases.
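+ # For example, with two F-contiguous inputs and one C-contiguous input,
+ # num_f_layout (2) exceeds num_c_layout (1), so the loop nest below runs
+ # in Fortran order.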
+ if num_f_layout > num_c_layout:
+ order = "F"
+ else:
+ order = "C"
+
+ with cgutils.loop_nest(
+ builder, loopshape, intp=intpty, order=order
+ ) as loop_indices:
+ vals_in = []
+ for i, (index, arg) in enumerate(zip(indices, inputs)):
+ index.update_indices(loop_indices, i)
+ vals_in.append(arg.load_data(index.as_values()))
+
+ vals_out = _unpack_output_values(
+ ufunc, builder, kernel.generate(*vals_in)
+ )
+ for val_out, output in zip(vals_out, outputs):
+ output.store_data(loop_indices, val_out)
+
+ out = _pack_output_values(
+ ufunc,
+ context,
+ builder,
+ sig.return_type,
+ [o.return_val for o in outputs],
+ )
+ return impl_ret_new_ref(context, builder, sig.return_type, out)
+
+
+def numpy_gufunc_kernel(context, builder, sig, args, ufunc, kernel_class):
+ arguments = []
+ expected_ndims = kernel_class.dufunc.expected_ndims()
+ expected_ndims = expected_ndims[0] + expected_ndims[1]
+ is_input = [True] * ufunc.nin + [False] * ufunc.nout
+ for arg, ty, exp_ndim, is_inp in zip(
+ args, sig.args, expected_ndims, is_input
+ ): # noqa: E501
+ if isinstance(ty, types.ArrayCompatible):
+ # Create an array helper whose iteration returns a subarray
+ # with ndim specified by "exp_ndim"
+ arr = context.make_array(ty)(context, builder, arg)
+ shape = cgutils.unpack_tuple(builder, arr.shape, ty.ndim)
+ strides = cgutils.unpack_tuple(builder, arr.strides, ty.ndim)
+ inner_arr_ty = ty.copy(ndim=exp_ndim)
+ ndim = ty.ndim
+ layout = ty.layout
+ base_type = ty.dtype
+ array_helper = _ArrayGUHelper(
+ context,
+ builder,
+ shape,
+ strides,
+ arg,
+ layout,
+ base_type,
+ ndim,
+ inner_arr_ty,
+ is_inp,
+ )
+ arguments.append(array_helper)
+ else:
+ scalar_helper = _ScalarHelper(context, builder, arg, ty)
+ arguments.append(scalar_helper)
+ kernel = kernel_class(context, builder, sig)
+
+ layouts = [
+ arg.layout for arg in arguments if isinstance(arg, _ArrayGUHelper)
+ ]
+ num_c_layout = len([x for x in layouts if x == "C"])
+ num_f_layout = len([x for x in layouts if x == "F"])
+
+ # Only choose F iteration order if more arrays are in F layout.
+ # Default to C order otherwise.
+ # This is a best effort for performance. NumPy has more fancy logic that
+ # uses array iterators in non-trivial cases.
+ if num_f_layout > num_c_layout:
+ order = "F"
+ else:
+ order = "C"
+
+ outputs = arguments[ufunc.nin :]
+ intpty = context.get_value_type(types.intp)
+ indices = [inp.create_iter_indices() for inp in arguments]
+ loopshape_ndim = outputs[0].ndim - outputs[0].inner_arr_ty.ndim
+ loopshape = outputs[0].shape[:loopshape_ndim]
+
+ _sig = parse_signature(ufunc.gufunc_builder.signature)
+ for (idx_a, sig_a), (idx_b, sig_b) in itertools.combinations(
+ zip(range(len(arguments)), _sig[0] + _sig[1]), r=2
+ ):
+ # For each pair of arguments, inputs and outputs alike, the core
+ # dimensions must match if their signature symbols are the same.
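+ # e.g. with signature "(n,m),(n,m)->(n)", the two inputs share the
+ # symbols (n, m), so their trailing two dimensions are checked against
+ # each other below.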
+ arg_a, arg_b = arguments[idx_a], arguments[idx_b] + if sig_a == sig_b and all( + isinstance(x, _ArrayGUHelper) for x in (arg_a, arg_b) + ): + arg_a, arg_b = arguments[idx_a], arguments[idx_b] + arg_a.guard_match_core_dims(arg_b, len(sig_a)) + + for arg in arguments[: ufunc.nin]: + if isinstance(arg, _ArrayGUHelper): + arg.guard_shape(loopshape) + + with cgutils.loop_nest( + builder, loopshape, intp=intpty, order=order + ) as loop_indices: + vals_in = [] + for i, (index, arg) in enumerate(zip(indices, arguments)): + index.update_indices(loop_indices, i) + vals_in.append(arg.load_data(index.as_values())) + + kernel.generate(*vals_in) + + +# Kernels are the code to be executed inside the multidimensional loop. +class _Kernel(object): + def __init__(self, context, builder, outer_sig): + self.context = context + self.builder = builder + self.outer_sig = outer_sig + + def cast(self, val, fromty, toty): + """Numpy uses cast semantics that are different from standard Python + (for example, it does allow casting from complex to float). + + This method acts as a patch to context.cast so that it allows + complex to real/int casts. + + """ + if isinstance(fromty, types.Complex) and not isinstance( + toty, types.Complex + ): + # attempt conversion of the real part to the specified type. + # note that NumPy issues a warning in this kind of conversions + newty = fromty.underlying_float + attr = self.context.get_getattr(fromty, "real") + val = attr(self.context, self.builder, fromty, val, "real") + fromty = newty + # let the regular cast do the rest... + + return self.context.cast(self.builder, val, fromty, toty) + + def generate(self, *args): + isig = self.inner_sig + osig = self.outer_sig + cast_args = [ + self.cast(val, inty, outty) + for val, inty, outty in zip(args, osig.args, isig.args) + ] + if self.cres.objectmode: + func_type = self.context.call_conv.get_function_type( + types.pyobject, [types.pyobject] * len(isig.args) + ) + else: + func_type = self.context.call_conv.get_function_type( + isig.return_type, isig.args + ) + module = self.builder.block.function.module + entry_point = cgutils.get_or_insert_function( + module, func_type, self.cres.fndesc.llvm_func_name + ) + entry_point.attributes.add("alwaysinline") + + _, res = self.context.call_conv.call_function( + self.builder, entry_point, isig.return_type, isig.args, cast_args + ) + return self.cast(res, isig.return_type, osig.return_type) + + +def _ufunc_db_function(ufunc): + """Use the ufunc loop type information to select the code generation + function from the table provided by the dict_of_kernels. The dict + of kernels maps the loop identifier to a function with the + following signature: (context, builder, signature, args). + + The loop type information has the form 'AB->C'. The letters to the + left of '->' are the input types (specified as NumPy letter + types). The letters to the right of '->' are the output + types. There must be 'ufunc.nin' letters to the left of '->', and + 'ufunc.nout' letters to the right. + + For example, a binary float loop resulting in a float, will have + the following signature: 'ff->f'. + + A given ufunc implements many loops. The list of loops implemented + for a given ufunc can be accessed using the 'types' attribute in + the ufunc object. The NumPy machinery selects the first loop that + fits a given calling signature (in our case, what we call the + outer_sig). This logic is mimicked by 'ufunc_find_matching_loop'. 
+ """ + + class _KernelImpl(_Kernel): + def __init__(self, context, builder, outer_sig): + super(_KernelImpl, self).__init__(context, builder, outer_sig) + loop = ufunc_find_matching_loop( + ufunc, + outer_sig.args + tuple(_unpack_output_types(ufunc, outer_sig)), + ) + self.fn = context.get_ufunc_info(ufunc).get(loop.ufunc_sig) + self.inner_sig = _ufunc_loop_sig(loop.outputs, loop.inputs) + + if self.fn is None: + msg = "Don't know how to lower ufunc '{0}' for loop '{1}'" + raise NotImplementedError(msg.format(ufunc.__name__, loop)) + + def generate(self, *args): + isig = self.inner_sig + osig = self.outer_sig + + cast_args = [ + self.cast(val, inty, outty) + for val, inty, outty in zip(args, osig.args, isig.args) + ] + with force_error_model(self.context, "numpy"): + res = self.fn(self.context, self.builder, isig, cast_args) + dmm = self.context.data_model_manager + res = dmm[isig.return_type].from_return(self.builder, res) + return self.cast(res, isig.return_type, osig.return_type) + + return _KernelImpl + + +################################################################################ +# Helper functions that register the ufuncs + + +def register_ufunc_kernel(ufunc, kernel, lower): + def do_ufunc(context, builder, sig, args): + return numpy_ufunc_kernel(context, builder, sig, args, ufunc, kernel) + + _any = types.Any + in_args = (_any,) * ufunc.nin + + # Add a lowering for each out argument that is missing. + for n_explicit_out in range(ufunc.nout + 1): + out_args = (types.Array,) * n_explicit_out + lower(ufunc, *in_args, *out_args)(do_ufunc) + + return kernel + + +def register_unary_operator_kernel( + operator, ufunc, kernel, lower, inplace=False +): + assert not inplace # are there any inplace unary operators? + + def lower_unary_operator(context, builder, sig, args): + return numpy_ufunc_kernel(context, builder, sig, args, ufunc, kernel) + + _arr_kind = types.Array + lower(operator, _arr_kind)(lower_unary_operator) + + +def register_binary_operator_kernel(op, ufunc, kernel, lower, inplace=False): + def lower_binary_operator(context, builder, sig, args): + return numpy_ufunc_kernel(context, builder, sig, args, ufunc, kernel) + + def lower_inplace_operator(context, builder, sig, args): + # The visible signature is (A, B) -> A + # The implementation's signature (with explicit output) + # is (A, B, A) -> A + args = tuple(args) + (args[0],) + sig = typing.signature(sig.return_type, *sig.args + (sig.args[0],)) + return numpy_ufunc_kernel(context, builder, sig, args, ufunc, kernel) + + _any = types.Any + _arr_kind = types.Array + formal_sigs = [(_arr_kind, _arr_kind), (_any, _arr_kind), (_arr_kind, _any)] + for sig in formal_sigs: + if not inplace: + lower(op, *sig)(lower_binary_operator) + else: + lower(op, *sig)(lower_inplace_operator) + + +################################################################################ +# Use the contents of ufunc_db to initialize the supported ufuncs + + +@registry.lower(operator.pos, types.Array) +def array_positive_impl(context, builder, sig, args): + """Lowering function for +(array) expressions. Defined here + (numba.targets.npyimpl) since the remaining array-operator + lowering functions are also registered in this module. 
+ """ + + class _UnaryPositiveKernel(_Kernel): + def generate(self, *args): + [val] = args + return val + + return numpy_ufunc_kernel( + context, builder, sig, args, np.positive, _UnaryPositiveKernel + ) + + +def register_ufuncs(ufuncs, lower): + kernels = {} + for ufunc in ufuncs: + db_func = _ufunc_db_function(ufunc) + kernels[ufunc] = register_ufunc_kernel(ufunc, db_func, lower) + + for _op_map in ( + npydecl.NumpyRulesUnaryArrayOperator._op_map, + npydecl.NumpyRulesArrayOperator._op_map, + ): + for op, ufunc_name in _op_map.items(): + ufunc = getattr(np, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + register_unary_operator_kernel(op, ufunc, kernel, lower) + elif ufunc.nin == 2: + register_binary_operator_kernel(op, ufunc, kernel, lower) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + for _op_map in (npydecl.NumpyRulesInplaceArrayOperator._op_map,): + for op, ufunc_name in _op_map.items(): + ufunc = getattr(np, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + register_unary_operator_kernel( + op, ufunc, kernel, lower, inplace=True + ) + elif ufunc.nin == 2: + register_binary_operator_kernel( + op, ufunc, kernel, lower, inplace=True + ) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + +register_ufuncs(ufunc_db.get_ufuncs(), registry.lower) + + +@intrinsic +def _make_dtype_object(typingctx, desc): + """Given a string or NumberClass description *desc*, returns the dtype object.""" + + def from_nb_type(nb_type): + return_type = types.DType(nb_type) + sig = return_type(desc) + + def codegen(context, builder, signature, args): + # All dtype objects are dummy values in LLVM. + # They only exist in the type level. + return context.get_dummy_value() + + return sig, codegen + + if isinstance(desc, types.Literal): + # Convert the str description into np.dtype then to numba type. + nb_type = from_dtype(np.dtype(desc.literal_value)) + return from_nb_type(nb_type) + elif isinstance(desc, types.functions.NumberClass): + thestr = str(desc.dtype) + # Convert the str description into np.dtype then to numba type. 
+ nb_type = from_dtype(np.dtype(thestr)) + return from_nb_type(nb_type) + + +@overload(np.dtype) +def numpy_dtype(desc): + """Provide an implementation so that numpy.dtype function can be lowered.""" + if isinstance(desc, (types.Literal, types.functions.NumberClass)): + + def imp(desc): + return _make_dtype_object(desc) + + return imp + else: + raise errors.NumbaTypeError("unknown dtype descriptor: {}".format(desc)) diff --git a/numba_cuda/numba/cuda/np/numpy_support.py b/numba_cuda/numba/cuda/np/numpy_support.py index 4fe3f6f54..93fb4ef07 100644 --- a/numba_cuda/numba/cuda/np/numpy_support.py +++ b/numba_cuda/numba/cuda/np/numpy_support.py @@ -4,9 +4,12 @@ import collections import numpy as np import re - from numba.core import types, errors -from numba.cuda.typing.templates import signature +from numba.cuda.typing import signature +import ctypes +from numba.core.errors import TypingError + + from numba.cuda.np import npdatetime_helpers numpy_version = tuple(map(int, np.__version__.split(".")[:2])) @@ -551,3 +554,222 @@ def from_struct_dtype(dtype): aligned = _is_aligned_struct(dtype) return types.Record(fields, size, aligned) + + +def _ufunc_loop_sig(out_tys, in_tys): + if len(out_tys) == 1: + return signature(out_tys[0], *in_tys) + else: + return signature(types.Tuple(out_tys), *in_tys) + + +def _get_bytes_buffer(ptr, nbytes): + """ + Get a ctypes array of *nbytes* starting at *ptr*. + """ + if isinstance(ptr, ctypes.c_void_p): + ptr = ptr.value + arrty = ctypes.c_byte * nbytes + return arrty.from_address(ptr) + + +def _get_array_from_ptr(ptr, nbytes, dtype): + return np.frombuffer(_get_bytes_buffer(ptr, nbytes), dtype) + + +def carray(ptr, shape, dtype=None): + """ + Return a Numpy array view over the data pointed to by *ptr* with the + given *shape*, in C order. If *dtype* is given, it is used as the + array's dtype, otherwise the array's dtype is inferred from *ptr*'s type. + """ + from numba.core.typing.ctypes_utils import from_ctypes + + try: + # Use ctypes parameter protocol if available + ptr = ptr._as_parameter_ + except AttributeError: + pass + + # Normalize dtype, to accept e.g. "int64" or np.int64 + if dtype is not None: + dtype = np.dtype(dtype) + + if isinstance(ptr, ctypes.c_void_p): + if dtype is None: + raise TypeError("explicit dtype required for void* argument") + p = ptr + elif isinstance(ptr, ctypes._Pointer): + ptrty = from_ctypes(ptr.__class__) + assert isinstance(ptrty, types.CPointer) + ptr_dtype = as_dtype(ptrty.dtype) + if dtype is not None and dtype != ptr_dtype: + raise TypeError( + "mismatching dtype '%s' for pointer %s" % (dtype, ptr) + ) + dtype = ptr_dtype + p = ctypes.cast(ptr, ctypes.c_void_p) + else: + raise TypeError("expected a ctypes pointer, got %r" % (ptr,)) + + nbytes = dtype.itemsize * np.prod(shape, dtype=np.intp) + return _get_array_from_ptr(p, nbytes, dtype).reshape(shape) + + +def farray(ptr, shape, dtype=None): + """ + Return a Numpy array view over the data pointed to by *ptr* with the + given *shape*, in Fortran order. If *dtype* is given, it is used as the + array's dtype, otherwise the array's dtype is inferred from *ptr*'s type. + """ + if not isinstance(shape, int): + shape = shape[::-1] + return carray(ptr, shape, dtype).T + + +def is_contiguous(dims, strides, itemsize): + """Is the given shape, strides, and itemsize of C layout? 
+ + Note: The code is usable as a numba-compiled function + """ + nd = len(dims) + # Check and skip 1s or 0s in inner dims + innerax = nd - 1 + while innerax > -1 and dims[innerax] <= 1: + innerax -= 1 + + # Early exit if all axis are 1s or 0s + if innerax < 0: + return True + + # Check itemsize matches innermost stride + if itemsize != strides[innerax]: + return False + + # Check and skip 1s or 0s in outer dims + outerax = 0 + while outerax < innerax and dims[outerax] <= 1: + outerax += 1 + + # Check remaining strides to be contiguous + ax = innerax + while ax > outerax: + if strides[ax] * dims[ax] != strides[ax - 1]: + return False + ax -= 1 + return True + + +def is_fortran(dims, strides, itemsize): + """Is the given shape, strides, and itemsize of F layout? + + Note: The code is usable as a numba-compiled function + """ + nd = len(dims) + # Check and skip 1s or 0s in inner dims + firstax = 0 + while firstax < nd and dims[firstax] <= 1: + firstax += 1 + + # Early exit if all axis are 1s or 0s + if firstax >= nd: + return True + + # Check itemsize matches innermost stride + if itemsize != strides[firstax]: + return False + + # Check and skip 1s or 0s in outer dims + lastax = nd - 1 + while lastax > firstax and dims[lastax] <= 1: + lastax -= 1 + + # Check remaining strides to be contiguous + ax = firstax + while ax < lastax: + if strides[ax] * dims[ax] != strides[ax + 1]: + return False + ax += 1 + return True + + +def type_can_asarray(arr): + """Returns True if the type of 'arr' is supported by the Numba `np.asarray` + implementation, False otherwise. + """ + + ok = ( + types.Array, + types.Sequence, + types.Tuple, + types.StringLiteral, + types.Number, + types.Boolean, + types.containers.ListType, + ) + + return isinstance(arr, ok) + + +def type_is_scalar(typ): + """Returns True if the type of 'typ' is a scalar type, according to + NumPy rules. False otherwise. + https://numpy.org/doc/stable/reference/arrays.scalars.html#built-in-scalar-types + """ + + ok = ( + types.Boolean, + types.Number, + types.UnicodeType, + types.StringLiteral, + types.NPTimedelta, + types.NPDatetime, + ) + return isinstance(typ, ok) + + +def check_is_integer(v, name): + """Raises TypingError if the value is not an integer.""" + if not isinstance(v, (int, types.Integer)): + raise TypingError("{} must be an integer".format(name)) + + +def lt_floats(a, b): + # Adapted from NumPy commit 717c7acf which introduced the behavior of + # putting NaNs at the end. + # The code is later moved to numpy/core/src/npysort/npysort_common.h + # This info is gathered as of NumPy commit d8c09c50 + return a < b or (np.isnan(b) and not np.isnan(a)) + + +def lt_complex(a, b): + if np.isnan(a.real): + if np.isnan(b.real): + if np.isnan(a.imag): + return False + else: + if np.isnan(b.imag): + return True + else: + return a.imag < b.imag + else: + return False + + else: + if np.isnan(b.real): + return True + else: + if np.isnan(a.imag): + if np.isnan(b.imag): + return a.real < b.real + else: + return False + else: + if np.isnan(b.imag): + return True + else: + if a.real < b.real: + return True + elif a.real == b.real: + return a.imag < b.imag + return False diff --git a/numba_cuda/numba/cuda/np/ufunc/sigparse.py b/numba_cuda/numba/cuda/np/ufunc/sigparse.py new file mode 100644 index 000000000..89831c664 --- /dev/null +++ b/numba_cuda/numba/cuda/np/ufunc/sigparse.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-2-Clause + +import tokenize +import string + + +def parse_signature(sig): + """Parse generalized ufunc signature. + + NOTE: ',' (COMMA) is a delimiter; not separator. + This means trailing comma is legal. + """ + + def stripws(s): + return "".join(c for c in s if c not in string.whitespace) + + def tokenizer(src): + def readline(): + yield src + + gen = readline() + return tokenize.generate_tokens(lambda: next(gen)) + + def parse(src): + tokgen = tokenizer(src) + while True: + tok = next(tokgen) + if tok[1] == "(": + symbols = [] + while True: + tok = next(tokgen) + if tok[1] == ")": + break + elif tok[0] == tokenize.NAME: + symbols.append(tok[1]) + elif tok[1] == ",": + continue + else: + raise ValueError('bad token in signature "%s"' % tok[1]) + yield tuple(symbols) + tok = next(tokgen) + if tok[1] == ",": + continue + elif tokenize.ISEOF(tok[0]): + break + elif tokenize.ISEOF(tok[0]): + break + else: + raise ValueError('bad token in signature "%s"' % tok[1]) + + ins, _, outs = stripws(sig).partition("->") + inputs = list(parse(ins)) + outputs = list(parse(outs)) + + # check that all output symbols are defined in the inputs + isym = set() + osym = set() + for grp in inputs: + isym |= set(grp) + for grp in outputs: + osym |= set(grp) + + diff = osym.difference(isym) + if diff: + raise NameError("undefined output symbols: %s" % ",".join(sorted(diff))) + + return inputs, outputs diff --git a/numba_cuda/numba/cuda/np/ufunc_db.py b/numba_cuda/numba/cuda/np/ufunc_db.py new file mode 100644 index 000000000..e19ae59f8 --- /dev/null +++ b/numba_cuda/numba/cuda/np/ufunc_db.py @@ -0,0 +1,1282 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +"""This file contains information on how to translate different ufuncs +into numba. It is a database of different ufuncs and how each of its +loops maps to a function that implements the inner kernel of that ufunc +(the inner kernel being the per-element function). + +Use the function get_ufunc_info to get the information related to the +ufunc +""" + +import numpy as np +import sys + +# this is lazily initialized to avoid circular imports +IS_WIN32 = sys.platform.startswith("win32") +numpy_version = tuple(map(int, np.__version__.split(".")[:2])) +_ufunc_db = None + + +def _lazy_init_db(): + global _ufunc_db + + if _ufunc_db is None: + _ufunc_db = {} + _fill_ufunc_db(_ufunc_db) + + +def get_ufuncs(): + """obtain a list of supported ufuncs in the db""" + _lazy_init_db() + return _ufunc_db.keys() + + +def get_ufunc_info(ufunc_key): + """get the lowering information for the ufunc with key ufunc_key. + + The lowering information is a dictionary that maps from a numpy + loop string (as given by the ufunc types attribute) to a function + that handles code generation for a scalar version of the ufunc + (that is, generates the "per element" operation"). + + raises a KeyError if the ufunc is not in the ufunc_db + """ + _lazy_init_db() + return _ufunc_db[ufunc_key] + + +def _fill_ufunc_db(ufunc_db): + # some of these imports would cause a problem of circular + # imports if done at global scope when importing the numba + # module. 
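+ # Loop keys below use NumPy type-character codes, e.g. "ff->f" is the
+ # float32 binary loop and "D->D" the complex128 unary loop.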
+ from numba.cuda.np import npyfuncs + from numba.cuda.np.math import cmathimpl, mathimpl, numbers + from numba.cuda.np.numpy_support import numpy_version + + ufunc_db[np.isnat] = { + # datetime & timedelta + "M->?": npyfuncs.np_datetime_isnat_impl, + "m->?": npyfuncs.np_datetime_isnat_impl, + } + + ufunc_db[np.negative] = { + "?->?": numbers.int_invert_impl, + "b->b": numbers.int_negate_impl, + "B->B": numbers.int_negate_impl, + "h->h": numbers.int_negate_impl, + "H->H": numbers.int_negate_impl, + "i->i": numbers.int_negate_impl, + "I->I": numbers.int_negate_impl, + "l->l": numbers.int_negate_impl, + "L->L": numbers.int_negate_impl, + "q->q": numbers.int_negate_impl, + "Q->Q": numbers.int_negate_impl, + "f->f": numbers.real_negate_impl, + "d->d": numbers.real_negate_impl, + "F->F": numbers.complex_negate_impl, + "D->D": numbers.complex_negate_impl, + } + + ufunc_db[np.positive] = { + "?->?": numbers.int_positive_impl, + "b->b": numbers.int_positive_impl, + "B->B": numbers.int_positive_impl, + "h->h": numbers.int_positive_impl, + "H->H": numbers.int_positive_impl, + "i->i": numbers.int_positive_impl, + "I->I": numbers.int_positive_impl, + "l->l": numbers.int_positive_impl, + "L->L": numbers.int_positive_impl, + "q->q": numbers.int_positive_impl, + "Q->Q": numbers.int_positive_impl, + "f->f": numbers.real_positive_impl, + "d->d": numbers.real_positive_impl, + "F->F": numbers.complex_positive_impl, + "D->D": numbers.complex_positive_impl, + } + + ufunc_db[np.absolute] = { + "?->?": numbers.int_abs_impl, + "b->b": numbers.int_abs_impl, + "B->B": numbers.uint_abs_impl, + "h->h": numbers.int_abs_impl, + "H->H": numbers.uint_abs_impl, + "i->i": numbers.int_abs_impl, + "I->I": numbers.uint_abs_impl, + "l->l": numbers.int_abs_impl, + "L->L": numbers.uint_abs_impl, + "q->q": numbers.int_abs_impl, + "Q->Q": numbers.uint_abs_impl, + "f->f": numbers.real_abs_impl, + "d->d": numbers.real_abs_impl, + "F->f": numbers.complex_abs_impl, + "D->d": numbers.complex_abs_impl, + } + + ufunc_db[np.sign] = { + "b->b": numbers.int_sign_impl, + "B->B": numbers.int_sign_impl, + "h->h": numbers.int_sign_impl, + "H->H": numbers.int_sign_impl, + "i->i": numbers.int_sign_impl, + "I->I": numbers.int_sign_impl, + "l->l": numbers.int_sign_impl, + "L->L": numbers.int_sign_impl, + "q->q": numbers.int_sign_impl, + "Q->Q": numbers.int_sign_impl, + "f->f": numbers.real_sign_impl, + "d->d": numbers.real_sign_impl, + "F->F": npyfuncs.np_complex_sign_impl, + "D->D": npyfuncs.np_complex_sign_impl, + } + + ufunc_db[np.add] = { + "??->?": numbers.int_or_impl, + "bb->b": numbers.int_add_impl, + "BB->B": numbers.int_add_impl, + "hh->h": numbers.int_add_impl, + "HH->H": numbers.int_add_impl, + "ii->i": numbers.int_add_impl, + "II->I": numbers.int_add_impl, + "ll->l": numbers.int_add_impl, + "LL->L": numbers.int_add_impl, + "qq->q": numbers.int_add_impl, + "QQ->Q": numbers.int_add_impl, + "ff->f": numbers.real_add_impl, + "dd->d": numbers.real_add_impl, + "FF->F": numbers.complex_add_impl, + "DD->D": numbers.complex_add_impl, + } + + ufunc_db[np.subtract] = { + "??->?": numbers.int_xor_impl, + "bb->b": numbers.int_sub_impl, + "BB->B": numbers.int_sub_impl, + "hh->h": numbers.int_sub_impl, + "HH->H": numbers.int_sub_impl, + "ii->i": numbers.int_sub_impl, + "II->I": numbers.int_sub_impl, + "ll->l": numbers.int_sub_impl, + "LL->L": numbers.int_sub_impl, + "qq->q": numbers.int_sub_impl, + "QQ->Q": numbers.int_sub_impl, + "ff->f": numbers.real_sub_impl, + "dd->d": numbers.real_sub_impl, + "FF->F": numbers.complex_sub_impl, + "DD->D": 
numbers.complex_sub_impl, + } + + ufunc_db[np.multiply] = { + "??->?": numbers.int_and_impl, + "bb->b": numbers.int_mul_impl, + "BB->B": numbers.int_mul_impl, + "hh->h": numbers.int_mul_impl, + "HH->H": numbers.int_mul_impl, + "ii->i": numbers.int_mul_impl, + "II->I": numbers.int_mul_impl, + "ll->l": numbers.int_mul_impl, + "LL->L": numbers.int_mul_impl, + "qq->q": numbers.int_mul_impl, + "QQ->Q": numbers.int_mul_impl, + "ff->f": numbers.real_mul_impl, + "dd->d": numbers.real_mul_impl, + "FF->F": numbers.complex_mul_impl, + "DD->D": numbers.complex_mul_impl, + } + + if np.divide != np.true_divide: + ufunc_db[np.divide] = { + "bb->b": npyfuncs.np_int_sdiv_impl, + "BB->B": npyfuncs.np_int_udiv_impl, + "hh->h": npyfuncs.np_int_sdiv_impl, + "HH->H": npyfuncs.np_int_udiv_impl, + "ii->i": npyfuncs.np_int_sdiv_impl, + "II->I": npyfuncs.np_int_udiv_impl, + "ll->l": npyfuncs.np_int_sdiv_impl, + "LL->L": npyfuncs.np_int_udiv_impl, + "qq->q": npyfuncs.np_int_sdiv_impl, + "QQ->Q": npyfuncs.np_int_udiv_impl, + "ff->f": npyfuncs.np_real_div_impl, + "dd->d": npyfuncs.np_real_div_impl, + "FF->F": npyfuncs.np_complex_div_impl, + "DD->D": npyfuncs.np_complex_div_impl, + } + + ufunc_db[np.true_divide] = { + "bb->d": npyfuncs.np_int_truediv_impl, + "BB->d": npyfuncs.np_int_truediv_impl, + "hh->d": npyfuncs.np_int_truediv_impl, + "HH->d": npyfuncs.np_int_truediv_impl, + "ii->d": npyfuncs.np_int_truediv_impl, + "II->d": npyfuncs.np_int_truediv_impl, + "ll->d": npyfuncs.np_int_truediv_impl, + "LL->d": npyfuncs.np_int_truediv_impl, + "qq->d": npyfuncs.np_int_truediv_impl, + "QQ->d": npyfuncs.np_int_truediv_impl, + "ff->f": npyfuncs.np_real_div_impl, + "dd->d": npyfuncs.np_real_div_impl, + "FF->F": npyfuncs.np_complex_div_impl, + "DD->D": npyfuncs.np_complex_div_impl, + } + + ufunc_db[np.floor_divide] = { + "bb->b": npyfuncs.np_int_sdiv_impl, + "BB->B": npyfuncs.np_int_udiv_impl, + "hh->h": npyfuncs.np_int_sdiv_impl, + "HH->H": npyfuncs.np_int_udiv_impl, + "ii->i": npyfuncs.np_int_sdiv_impl, + "II->I": npyfuncs.np_int_udiv_impl, + "ll->l": npyfuncs.np_int_sdiv_impl, + "LL->L": npyfuncs.np_int_udiv_impl, + "qq->q": npyfuncs.np_int_sdiv_impl, + "QQ->Q": npyfuncs.np_int_udiv_impl, + "ff->f": npyfuncs.np_real_floor_div_impl, + "dd->d": npyfuncs.np_real_floor_div_impl, + } + + ufunc_db[np.remainder] = { + "bb->b": npyfuncs.np_int_srem_impl, + "BB->B": npyfuncs.np_int_urem_impl, + "hh->h": npyfuncs.np_int_srem_impl, + "HH->H": npyfuncs.np_int_urem_impl, + "ii->i": npyfuncs.np_int_srem_impl, + "II->I": npyfuncs.np_int_urem_impl, + "ll->l": npyfuncs.np_int_srem_impl, + "LL->L": npyfuncs.np_int_urem_impl, + "qq->q": npyfuncs.np_int_srem_impl, + "QQ->Q": npyfuncs.np_int_urem_impl, + "ff->f": npyfuncs.np_real_mod_impl, + "dd->d": npyfuncs.np_real_mod_impl, + } + + ufunc_db[np.divmod] = { + "bb->bb": npyfuncs.np_int_sdivrem_impl, + "BB->BB": npyfuncs.np_int_udivrem_impl, + "hh->hh": npyfuncs.np_int_sdivrem_impl, + "HH->HH": npyfuncs.np_int_udivrem_impl, + "ii->ii": npyfuncs.np_int_sdivrem_impl, + "II->II": npyfuncs.np_int_udivrem_impl, + "ll->ll": npyfuncs.np_int_sdivrem_impl, + "LL->LL": npyfuncs.np_int_udivrem_impl, + "qq->qq": npyfuncs.np_int_sdivrem_impl, + "QQ->QQ": npyfuncs.np_int_udivrem_impl, + "ff->ff": npyfuncs.np_real_divmod_impl, + "dd->dd": npyfuncs.np_real_divmod_impl, + } + + ufunc_db[np.fmod] = { + "bb->b": npyfuncs.np_int_fmod_impl, + "BB->B": npyfuncs.np_int_fmod_impl, + "hh->h": npyfuncs.np_int_fmod_impl, + "HH->H": npyfuncs.np_int_fmod_impl, + "ii->i": npyfuncs.np_int_fmod_impl, + "II->I": 
npyfuncs.np_int_fmod_impl, + "ll->l": npyfuncs.np_int_fmod_impl, + "LL->L": npyfuncs.np_int_fmod_impl, + "qq->q": npyfuncs.np_int_fmod_impl, + "QQ->Q": npyfuncs.np_int_fmod_impl, + "ff->f": npyfuncs.np_real_fmod_impl, + "dd->d": npyfuncs.np_real_fmod_impl, + } + + ufunc_db[np.logaddexp] = { + "ff->f": npyfuncs.np_real_logaddexp_impl, + "dd->d": npyfuncs.np_real_logaddexp_impl, + } + + ufunc_db[np.logaddexp2] = { + "ff->f": npyfuncs.np_real_logaddexp2_impl, + "dd->d": npyfuncs.np_real_logaddexp2_impl, + } + + ufunc_db[np.power] = { + "bb->b": numbers.int_power_impl, + "BB->B": numbers.int_power_impl, + "hh->h": numbers.int_power_impl, + "HH->H": numbers.int_power_impl, + "ii->i": numbers.int_power_impl, + "II->I": numbers.int_power_impl, + "ll->l": numbers.int_power_impl, + "LL->L": numbers.int_power_impl, + "qq->q": numbers.int_power_impl, + "QQ->Q": numbers.int_power_impl, + # XXX we would like to use `int_power_impl` for real ** integer + # as well (for better performance), but the current ufunc typing + # rules forbid that + "ff->f": numbers.real_power_impl, + "dd->d": numbers.real_power_impl, + "FF->F": npyfuncs.np_complex_power_impl, + "DD->D": npyfuncs.np_complex_power_impl, + } + + ufunc_db[np.float_power] = { + "ff->f": npyfuncs.real_float_power_impl, + "dd->d": npyfuncs.real_float_power_impl, + "FF->F": npyfuncs.np_complex_float_power_impl, + "DD->D": npyfuncs.np_complex_float_power_impl, + } + + ufunc_db[np.gcd] = { + "bb->b": npyfuncs.np_gcd_impl, + "BB->B": npyfuncs.np_gcd_impl, + "hh->h": npyfuncs.np_gcd_impl, + "HH->H": npyfuncs.np_gcd_impl, + "ii->i": npyfuncs.np_gcd_impl, + "II->I": npyfuncs.np_gcd_impl, + "ll->l": npyfuncs.np_gcd_impl, + "LL->L": npyfuncs.np_gcd_impl, + "qq->q": npyfuncs.np_gcd_impl, + "QQ->Q": npyfuncs.np_gcd_impl, + } + + ufunc_db[np.lcm] = { + "bb->b": npyfuncs.np_lcm_impl, + "BB->B": npyfuncs.np_lcm_impl, + "hh->h": npyfuncs.np_lcm_impl, + "HH->H": npyfuncs.np_lcm_impl, + "ii->i": npyfuncs.np_lcm_impl, + "II->I": npyfuncs.np_lcm_impl, + "ll->l": npyfuncs.np_lcm_impl, + "LL->L": npyfuncs.np_lcm_impl, + "qq->q": npyfuncs.np_lcm_impl, + "QQ->Q": npyfuncs.np_lcm_impl, + } + + ufunc_db[np.rint] = { + "f->f": npyfuncs.np_real_rint_impl, + "d->d": npyfuncs.np_real_rint_impl, + "F->F": npyfuncs.np_complex_rint_impl, + "D->D": npyfuncs.np_complex_rint_impl, + } + + ufunc_db[np.conjugate] = { + "b->b": numbers.real_conjugate_impl, + "B->B": numbers.real_conjugate_impl, + "h->h": numbers.real_conjugate_impl, + "H->H": numbers.real_conjugate_impl, + "i->i": numbers.real_conjugate_impl, + "I->I": numbers.real_conjugate_impl, + "l->l": numbers.real_conjugate_impl, + "L->L": numbers.real_conjugate_impl, + "q->q": numbers.real_conjugate_impl, + "Q->Q": numbers.real_conjugate_impl, + "f->f": numbers.real_conjugate_impl, + "d->d": numbers.real_conjugate_impl, + "F->F": numbers.complex_conjugate_impl, + "D->D": numbers.complex_conjugate_impl, + } + + ufunc_db[np.exp] = { + "f->f": npyfuncs.np_real_exp_impl, + "d->d": npyfuncs.np_real_exp_impl, + "F->F": npyfuncs.np_complex_exp_impl, + "D->D": npyfuncs.np_complex_exp_impl, + } + + ufunc_db[np.exp2] = { + "f->f": npyfuncs.np_real_exp2_impl, + "d->d": npyfuncs.np_real_exp2_impl, + "F->F": npyfuncs.np_complex_exp2_impl, + "D->D": npyfuncs.np_complex_exp2_impl, + } + + ufunc_db[np.log] = { + "f->f": npyfuncs.np_real_log_impl, + "d->d": npyfuncs.np_real_log_impl, + "F->F": npyfuncs.np_complex_log_impl, + "D->D": npyfuncs.np_complex_log_impl, + } + + ufunc_db[np.log2] = { + "f->f": npyfuncs.np_real_log2_impl, + "d->d": 
npyfuncs.np_real_log2_impl, + "F->F": npyfuncs.np_complex_log2_impl, + "D->D": npyfuncs.np_complex_log2_impl, + } + + ufunc_db[np.log10] = { + "f->f": npyfuncs.np_real_log10_impl, + "d->d": npyfuncs.np_real_log10_impl, + "F->F": npyfuncs.np_complex_log10_impl, + "D->D": npyfuncs.np_complex_log10_impl, + } + + ufunc_db[np.expm1] = { + "f->f": npyfuncs.np_real_expm1_impl, + "d->d": npyfuncs.np_real_expm1_impl, + "F->F": npyfuncs.np_complex_expm1_impl, + "D->D": npyfuncs.np_complex_expm1_impl, + } + + ufunc_db[np.log1p] = { + "f->f": npyfuncs.np_real_log1p_impl, + "d->d": npyfuncs.np_real_log1p_impl, + "F->F": npyfuncs.np_complex_log1p_impl, + "D->D": npyfuncs.np_complex_log1p_impl, + } + + ufunc_db[np.sqrt] = { + "f->f": npyfuncs.np_real_sqrt_impl, + "d->d": npyfuncs.np_real_sqrt_impl, + "F->F": npyfuncs.np_complex_sqrt_impl, + "D->D": npyfuncs.np_complex_sqrt_impl, + } + + ufunc_db[np.square] = { + "b->b": npyfuncs.np_int_square_impl, + "B->B": npyfuncs.np_int_square_impl, + "h->h": npyfuncs.np_int_square_impl, + "H->H": npyfuncs.np_int_square_impl, + "i->i": npyfuncs.np_int_square_impl, + "I->I": npyfuncs.np_int_square_impl, + "l->l": npyfuncs.np_int_square_impl, + "L->L": npyfuncs.np_int_square_impl, + "q->q": npyfuncs.np_int_square_impl, + "Q->Q": npyfuncs.np_int_square_impl, + "f->f": npyfuncs.np_real_square_impl, + "d->d": npyfuncs.np_real_square_impl, + "F->F": npyfuncs.np_complex_square_impl, + "D->D": npyfuncs.np_complex_square_impl, + } + + ufunc_db[np.cbrt] = { + "f->f": npyfuncs.np_real_cbrt_impl, + "d->d": npyfuncs.np_real_cbrt_impl, + } + + ufunc_db[np.reciprocal] = { + "b->b": npyfuncs.np_int_reciprocal_impl, + "B->B": npyfuncs.np_int_reciprocal_impl, + "h->h": npyfuncs.np_int_reciprocal_impl, + "H->H": npyfuncs.np_int_reciprocal_impl, + "i->i": npyfuncs.np_int_reciprocal_impl, + "I->I": npyfuncs.np_int_reciprocal_impl, + "l->l": npyfuncs.np_int_reciprocal_impl, + "L->L": npyfuncs.np_int_reciprocal_impl, + "q->q": npyfuncs.np_int_reciprocal_impl, + "Q->Q": npyfuncs.np_int_reciprocal_impl, + "f->f": npyfuncs.np_real_reciprocal_impl, + "d->d": npyfuncs.np_real_reciprocal_impl, + "F->F": npyfuncs.np_complex_reciprocal_impl, + "D->D": npyfuncs.np_complex_reciprocal_impl, + } + + ufunc_db[np.sin] = { + "f->f": npyfuncs.np_real_sin_impl, + "d->d": npyfuncs.np_real_sin_impl, + "F->F": npyfuncs.np_complex_sin_impl, + "D->D": npyfuncs.np_complex_sin_impl, + } + + ufunc_db[np.cos] = { + "f->f": npyfuncs.np_real_cos_impl, + "d->d": npyfuncs.np_real_cos_impl, + "F->F": npyfuncs.np_complex_cos_impl, + "D->D": npyfuncs.np_complex_cos_impl, + } + + tan_impl = cmathimpl.tan_impl + + ufunc_db[np.tan] = { + "f->f": npyfuncs.np_real_tan_impl, + "d->d": npyfuncs.np_real_tan_impl, + "F->F": tan_impl, + "D->D": tan_impl, + } + + arcsin_impl = cmathimpl.asin_impl + + ufunc_db[np.arcsin] = { + "f->f": npyfuncs.np_real_asin_impl, + "d->d": npyfuncs.np_real_asin_impl, + "F->F": arcsin_impl, + "D->D": arcsin_impl, + } + + ufunc_db[np.arccos] = { + "f->f": npyfuncs.np_real_acos_impl, + "d->d": npyfuncs.np_real_acos_impl, + "F->F": cmathimpl.acos_impl, + "D->D": cmathimpl.acos_impl, + } + + arctan_impl = cmathimpl.atan_impl + + ufunc_db[np.arctan] = { + "f->f": npyfuncs.np_real_atan_impl, + "d->d": npyfuncs.np_real_atan_impl, + "F->F": arctan_impl, + "D->D": arctan_impl, + } + + ufunc_db[np.arctan2] = { + "ff->f": npyfuncs.np_real_atan2_impl, + "dd->d": npyfuncs.np_real_atan2_impl, + } + + ufunc_db[np.hypot] = { + "ff->f": npyfuncs.np_real_hypot_impl, + "dd->d": npyfuncs.np_real_hypot_impl, + } + + 
ufunc_db[np.sinh] = { + "f->f": npyfuncs.np_real_sinh_impl, + "d->d": npyfuncs.np_real_sinh_impl, + "F->F": npyfuncs.np_complex_sinh_impl, + "D->D": npyfuncs.np_complex_sinh_impl, + } + + ufunc_db[np.cosh] = { + "f->f": npyfuncs.np_real_cosh_impl, + "d->d": npyfuncs.np_real_cosh_impl, + "F->F": npyfuncs.np_complex_cosh_impl, + "D->D": npyfuncs.np_complex_cosh_impl, + } + + ufunc_db[np.tanh] = { + "f->f": npyfuncs.np_real_tanh_impl, + "d->d": npyfuncs.np_real_tanh_impl, + "F->F": npyfuncs.np_complex_tanh_impl, + "D->D": npyfuncs.np_complex_tanh_impl, + } + + arcsinh_impl = cmathimpl.asinh_impl + + ufunc_db[np.arcsinh] = { + "f->f": npyfuncs.np_real_asinh_impl, + "d->d": npyfuncs.np_real_asinh_impl, + "F->F": arcsinh_impl, + "D->D": arcsinh_impl, + } + + ufunc_db[np.arccosh] = { + "f->f": npyfuncs.np_real_acosh_impl, + "d->d": npyfuncs.np_real_acosh_impl, + "F->F": npyfuncs.np_complex_acosh_impl, + "D->D": npyfuncs.np_complex_acosh_impl, + } + + arctanh_impl = cmathimpl.atanh_impl + + ufunc_db[np.arctanh] = { + "f->f": npyfuncs.np_real_atanh_impl, + "d->d": npyfuncs.np_real_atanh_impl, + "F->F": arctanh_impl, + "D->D": arctanh_impl, + } + + ufunc_db[np.deg2rad] = { + "f->f": mathimpl.radians_float_impl, + "d->d": mathimpl.radians_float_impl, + } + + ufunc_db[np.radians] = ufunc_db[np.deg2rad] + + ufunc_db[np.rad2deg] = { + "f->f": mathimpl.degrees_float_impl, + "d->d": mathimpl.degrees_float_impl, + } + + ufunc_db[np.degrees] = ufunc_db[np.rad2deg] + + ufunc_db[np.floor] = { + "f->f": npyfuncs.np_real_floor_impl, + "d->d": npyfuncs.np_real_floor_impl, + } + if numpy_version >= (2, 1): + ufunc_db[np.floor].update( + { + "?->?": numbers.identity_impl, + "b->b": numbers.identity_impl, + "B->B": numbers.identity_impl, + "h->h": numbers.identity_impl, + "H->H": numbers.identity_impl, + "i->i": numbers.identity_impl, + "I->I": numbers.identity_impl, + "l->l": numbers.identity_impl, + "L->L": numbers.identity_impl, + "q->q": numbers.identity_impl, + "Q->Q": numbers.identity_impl, + } + ) + + ufunc_db[np.ceil] = { + "f->f": npyfuncs.np_real_ceil_impl, + "d->d": npyfuncs.np_real_ceil_impl, + } + if numpy_version >= (2, 1): + ufunc_db[np.ceil].update( + { + "?->?": numbers.identity_impl, + "b->b": numbers.identity_impl, + "B->B": numbers.identity_impl, + "h->h": numbers.identity_impl, + "H->H": numbers.identity_impl, + "i->i": numbers.identity_impl, + "I->I": numbers.identity_impl, + "l->l": numbers.identity_impl, + "L->L": numbers.identity_impl, + "q->q": numbers.identity_impl, + "Q->Q": numbers.identity_impl, + } + ) + + ufunc_db[np.trunc] = { + "f->f": npyfuncs.np_real_trunc_impl, + "d->d": npyfuncs.np_real_trunc_impl, + } + if numpy_version >= (2, 1): + ufunc_db[np.trunc].update( + { + "?->?": numbers.identity_impl, + "b->b": numbers.identity_impl, + "B->B": numbers.identity_impl, + "h->h": numbers.identity_impl, + "H->H": numbers.identity_impl, + "i->i": numbers.identity_impl, + "I->I": numbers.identity_impl, + "l->l": numbers.identity_impl, + "L->L": numbers.identity_impl, + "q->q": numbers.identity_impl, + "Q->Q": numbers.identity_impl, + } + ) + + ufunc_db[np.fabs] = { + "f->f": npyfuncs.np_real_fabs_impl, + "d->d": npyfuncs.np_real_fabs_impl, + } + + # logical ufuncs + ufunc_db[np.greater] = { + "??->?": numbers.int_ugt_impl, + "bb->?": numbers.int_sgt_impl, + "BB->?": numbers.int_ugt_impl, + "hh->?": numbers.int_sgt_impl, + "HH->?": numbers.int_ugt_impl, + "ii->?": numbers.int_sgt_impl, + "II->?": numbers.int_ugt_impl, + "ll->?": numbers.int_sgt_impl, + "LL->?": numbers.int_ugt_impl, + 
"qq->?": numbers.int_sgt_impl, + "QQ->?": numbers.int_ugt_impl, + "ff->?": numbers.real_gt_impl, + "dd->?": numbers.real_gt_impl, + "FF->?": npyfuncs.np_complex_gt_impl, + "DD->?": npyfuncs.np_complex_gt_impl, + } + if numpy_version >= (1, 25): + ufunc_db[np.greater].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp(">"), + "Qq->?": numbers.int_unsigned_signed_cmp(">"), + } + ) + + ufunc_db[np.greater_equal] = { + "??->?": numbers.int_uge_impl, + "bb->?": numbers.int_sge_impl, + "BB->?": numbers.int_uge_impl, + "hh->?": numbers.int_sge_impl, + "HH->?": numbers.int_uge_impl, + "ii->?": numbers.int_sge_impl, + "II->?": numbers.int_uge_impl, + "ll->?": numbers.int_sge_impl, + "LL->?": numbers.int_uge_impl, + "qq->?": numbers.int_sge_impl, + "QQ->?": numbers.int_uge_impl, + "ff->?": numbers.real_ge_impl, + "dd->?": numbers.real_ge_impl, + "FF->?": npyfuncs.np_complex_ge_impl, + "DD->?": npyfuncs.np_complex_ge_impl, + } + if numpy_version >= (1, 25): + ufunc_db[np.greater_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp(">="), + "Qq->?": numbers.int_unsigned_signed_cmp(">="), + } + ) + + ufunc_db[np.less] = { + "??->?": numbers.int_ult_impl, + "bb->?": numbers.int_slt_impl, + "BB->?": numbers.int_ult_impl, + "hh->?": numbers.int_slt_impl, + "HH->?": numbers.int_ult_impl, + "ii->?": numbers.int_slt_impl, + "II->?": numbers.int_ult_impl, + "ll->?": numbers.int_slt_impl, + "LL->?": numbers.int_ult_impl, + "qq->?": numbers.int_slt_impl, + "QQ->?": numbers.int_ult_impl, + "ff->?": numbers.real_lt_impl, + "dd->?": numbers.real_lt_impl, + "FF->?": npyfuncs.np_complex_lt_impl, + "DD->?": npyfuncs.np_complex_lt_impl, + } + if numpy_version >= (1, 25): + ufunc_db[np.less].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("<"), + "Qq->?": numbers.int_unsigned_signed_cmp("<"), + } + ) + + ufunc_db[np.less_equal] = { + "??->?": numbers.int_ule_impl, + "bb->?": numbers.int_sle_impl, + "BB->?": numbers.int_ule_impl, + "hh->?": numbers.int_sle_impl, + "HH->?": numbers.int_ule_impl, + "ii->?": numbers.int_sle_impl, + "II->?": numbers.int_ule_impl, + "ll->?": numbers.int_sle_impl, + "LL->?": numbers.int_ule_impl, + "qq->?": numbers.int_sle_impl, + "QQ->?": numbers.int_ule_impl, + "ff->?": numbers.real_le_impl, + "dd->?": numbers.real_le_impl, + "FF->?": npyfuncs.np_complex_le_impl, + "DD->?": npyfuncs.np_complex_le_impl, + } + if numpy_version >= (1, 25): + ufunc_db[np.less_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("<="), + "Qq->?": numbers.int_unsigned_signed_cmp("<="), + } + ) + + ufunc_db[np.not_equal] = { + "??->?": numbers.int_ne_impl, + "bb->?": numbers.int_ne_impl, + "BB->?": numbers.int_ne_impl, + "hh->?": numbers.int_ne_impl, + "HH->?": numbers.int_ne_impl, + "ii->?": numbers.int_ne_impl, + "II->?": numbers.int_ne_impl, + "ll->?": numbers.int_ne_impl, + "LL->?": numbers.int_ne_impl, + "qq->?": numbers.int_ne_impl, + "QQ->?": numbers.int_ne_impl, + "ff->?": numbers.real_ne_impl, + "dd->?": numbers.real_ne_impl, + "FF->?": npyfuncs.np_complex_ne_impl, + "DD->?": npyfuncs.np_complex_ne_impl, + } + if numpy_version >= (1, 25): + ufunc_db[np.not_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("!="), + "Qq->?": numbers.int_unsigned_signed_cmp("!="), + } + ) + + ufunc_db[np.equal] = { + "??->?": numbers.int_eq_impl, + "bb->?": numbers.int_eq_impl, + "BB->?": numbers.int_eq_impl, + "hh->?": numbers.int_eq_impl, + "HH->?": numbers.int_eq_impl, + "ii->?": numbers.int_eq_impl, + "II->?": numbers.int_eq_impl, + "ll->?": numbers.int_eq_impl, + "LL->?": 
numbers.int_eq_impl, + "qq->?": numbers.int_eq_impl, + "QQ->?": numbers.int_eq_impl, + "ff->?": numbers.real_eq_impl, + "dd->?": numbers.real_eq_impl, + "FF->?": npyfuncs.np_complex_eq_impl, + "DD->?": npyfuncs.np_complex_eq_impl, + } + if numpy_version >= (1, 25): + ufunc_db[np.equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("=="), + "Qq->?": numbers.int_unsigned_signed_cmp("=="), + } + ) + + ufunc_db[np.logical_and] = { + "??->?": npyfuncs.np_logical_and_impl, + "bb->?": npyfuncs.np_logical_and_impl, + "BB->?": npyfuncs.np_logical_and_impl, + "hh->?": npyfuncs.np_logical_and_impl, + "HH->?": npyfuncs.np_logical_and_impl, + "ii->?": npyfuncs.np_logical_and_impl, + "II->?": npyfuncs.np_logical_and_impl, + "ll->?": npyfuncs.np_logical_and_impl, + "LL->?": npyfuncs.np_logical_and_impl, + "qq->?": npyfuncs.np_logical_and_impl, + "QQ->?": npyfuncs.np_logical_and_impl, + "ff->?": npyfuncs.np_logical_and_impl, + "dd->?": npyfuncs.np_logical_and_impl, + "FF->?": npyfuncs.np_complex_logical_and_impl, + "DD->?": npyfuncs.np_complex_logical_and_impl, + } + + ufunc_db[np.logical_or] = { + "??->?": npyfuncs.np_logical_or_impl, + "bb->?": npyfuncs.np_logical_or_impl, + "BB->?": npyfuncs.np_logical_or_impl, + "hh->?": npyfuncs.np_logical_or_impl, + "HH->?": npyfuncs.np_logical_or_impl, + "ii->?": npyfuncs.np_logical_or_impl, + "II->?": npyfuncs.np_logical_or_impl, + "ll->?": npyfuncs.np_logical_or_impl, + "LL->?": npyfuncs.np_logical_or_impl, + "qq->?": npyfuncs.np_logical_or_impl, + "QQ->?": npyfuncs.np_logical_or_impl, + "ff->?": npyfuncs.np_logical_or_impl, + "dd->?": npyfuncs.np_logical_or_impl, + "FF->?": npyfuncs.np_complex_logical_or_impl, + "DD->?": npyfuncs.np_complex_logical_or_impl, + } + + ufunc_db[np.logical_xor] = { + "??->?": npyfuncs.np_logical_xor_impl, + "bb->?": npyfuncs.np_logical_xor_impl, + "BB->?": npyfuncs.np_logical_xor_impl, + "hh->?": npyfuncs.np_logical_xor_impl, + "HH->?": npyfuncs.np_logical_xor_impl, + "ii->?": npyfuncs.np_logical_xor_impl, + "II->?": npyfuncs.np_logical_xor_impl, + "ll->?": npyfuncs.np_logical_xor_impl, + "LL->?": npyfuncs.np_logical_xor_impl, + "qq->?": npyfuncs.np_logical_xor_impl, + "QQ->?": npyfuncs.np_logical_xor_impl, + "ff->?": npyfuncs.np_logical_xor_impl, + "dd->?": npyfuncs.np_logical_xor_impl, + "FF->?": npyfuncs.np_complex_logical_xor_impl, + "DD->?": npyfuncs.np_complex_logical_xor_impl, + } + + ufunc_db[np.logical_not] = { + "?->?": npyfuncs.np_logical_not_impl, + "b->?": npyfuncs.np_logical_not_impl, + "B->?": npyfuncs.np_logical_not_impl, + "h->?": npyfuncs.np_logical_not_impl, + "H->?": npyfuncs.np_logical_not_impl, + "i->?": npyfuncs.np_logical_not_impl, + "I->?": npyfuncs.np_logical_not_impl, + "l->?": npyfuncs.np_logical_not_impl, + "L->?": npyfuncs.np_logical_not_impl, + "q->?": npyfuncs.np_logical_not_impl, + "Q->?": npyfuncs.np_logical_not_impl, + "f->?": npyfuncs.np_logical_not_impl, + "d->?": npyfuncs.np_logical_not_impl, + "F->?": npyfuncs.np_complex_logical_not_impl, + "D->?": npyfuncs.np_complex_logical_not_impl, + } + + ufunc_db[np.maximum] = { + "??->?": npyfuncs.np_logical_or_impl, + "bb->b": npyfuncs.np_int_smax_impl, + "BB->B": npyfuncs.np_int_umax_impl, + "hh->h": npyfuncs.np_int_smax_impl, + "HH->H": npyfuncs.np_int_umax_impl, + "ii->i": npyfuncs.np_int_smax_impl, + "II->I": npyfuncs.np_int_umax_impl, + "ll->l": npyfuncs.np_int_smax_impl, + "LL->L": npyfuncs.np_int_umax_impl, + "qq->q": npyfuncs.np_int_smax_impl, + "QQ->Q": npyfuncs.np_int_umax_impl, + "ff->f": npyfuncs.np_real_maximum_impl, + "dd->d": 
npyfuncs.np_real_maximum_impl, + "FF->F": npyfuncs.np_complex_maximum_impl, + "DD->D": npyfuncs.np_complex_maximum_impl, + } + + ufunc_db[np.minimum] = { + "??->?": npyfuncs.np_logical_and_impl, + "bb->b": npyfuncs.np_int_smin_impl, + "BB->B": npyfuncs.np_int_umin_impl, + "hh->h": npyfuncs.np_int_smin_impl, + "HH->H": npyfuncs.np_int_umin_impl, + "ii->i": npyfuncs.np_int_smin_impl, + "II->I": npyfuncs.np_int_umin_impl, + "ll->l": npyfuncs.np_int_smin_impl, + "LL->L": npyfuncs.np_int_umin_impl, + "qq->q": npyfuncs.np_int_smin_impl, + "QQ->Q": npyfuncs.np_int_umin_impl, + "ff->f": npyfuncs.np_real_minimum_impl, + "dd->d": npyfuncs.np_real_minimum_impl, + "FF->F": npyfuncs.np_complex_minimum_impl, + "DD->D": npyfuncs.np_complex_minimum_impl, + } + + ufunc_db[np.fmax] = { + "??->?": npyfuncs.np_logical_or_impl, + "bb->b": npyfuncs.np_int_smax_impl, + "BB->B": npyfuncs.np_int_umax_impl, + "hh->h": npyfuncs.np_int_smax_impl, + "HH->H": npyfuncs.np_int_umax_impl, + "ii->i": npyfuncs.np_int_smax_impl, + "II->I": npyfuncs.np_int_umax_impl, + "ll->l": npyfuncs.np_int_smax_impl, + "LL->L": npyfuncs.np_int_umax_impl, + "qq->q": npyfuncs.np_int_smax_impl, + "QQ->Q": npyfuncs.np_int_umax_impl, + "ff->f": npyfuncs.np_real_fmax_impl, + "dd->d": npyfuncs.np_real_fmax_impl, + "FF->F": npyfuncs.np_complex_fmax_impl, + "DD->D": npyfuncs.np_complex_fmax_impl, + } + + ufunc_db[np.fmin] = { + "??->?": npyfuncs.np_logical_and_impl, + "bb->b": npyfuncs.np_int_smin_impl, + "BB->B": npyfuncs.np_int_umin_impl, + "hh->h": npyfuncs.np_int_smin_impl, + "HH->H": npyfuncs.np_int_umin_impl, + "ii->i": npyfuncs.np_int_smin_impl, + "II->I": npyfuncs.np_int_umin_impl, + "ll->l": npyfuncs.np_int_smin_impl, + "LL->L": npyfuncs.np_int_umin_impl, + "qq->q": npyfuncs.np_int_smin_impl, + "QQ->Q": npyfuncs.np_int_umin_impl, + "ff->f": npyfuncs.np_real_fmin_impl, + "dd->d": npyfuncs.np_real_fmin_impl, + "FF->F": npyfuncs.np_complex_fmin_impl, + "DD->D": npyfuncs.np_complex_fmin_impl, + } + + # misc floating functions + ufunc_db[np.isnan] = { + "f->?": npyfuncs.np_real_isnan_impl, + "d->?": npyfuncs.np_real_isnan_impl, + "F->?": npyfuncs.np_complex_isnan_impl, + "D->?": npyfuncs.np_complex_isnan_impl, + # int8 + "b->?": npyfuncs.np_int_isnan_impl, + "B->?": npyfuncs.np_int_isnan_impl, + # int16 + "h->?": npyfuncs.np_int_isnan_impl, + "H->?": npyfuncs.np_int_isnan_impl, + # int32 + "i->?": npyfuncs.np_int_isnan_impl, + "I->?": npyfuncs.np_int_isnan_impl, + # int64 + "l->?": npyfuncs.np_int_isnan_impl, + "L->?": npyfuncs.np_int_isnan_impl, + # intp + "q->?": npyfuncs.np_int_isnan_impl, + "Q->?": npyfuncs.np_int_isnan_impl, + # boolean + "?->?": npyfuncs.np_int_isnan_impl, + # datetime & timedelta + "m->?": npyfuncs.np_datetime_isnat_impl, + "M->?": npyfuncs.np_datetime_isnat_impl, + } + + ufunc_db[np.isinf] = { + "f->?": npyfuncs.np_real_isinf_impl, + "d->?": npyfuncs.np_real_isinf_impl, + "F->?": npyfuncs.np_complex_isinf_impl, + "D->?": npyfuncs.np_complex_isinf_impl, + # int8 + "b->?": npyfuncs.np_int_isinf_impl, + "B->?": npyfuncs.np_int_isinf_impl, + # int16 + "h->?": npyfuncs.np_int_isinf_impl, + "H->?": npyfuncs.np_int_isinf_impl, + # int32 + "i->?": npyfuncs.np_int_isinf_impl, + "I->?": npyfuncs.np_int_isinf_impl, + # int64 + "l->?": npyfuncs.np_int_isinf_impl, + "L->?": npyfuncs.np_int_isinf_impl, + # intp + "q->?": npyfuncs.np_int_isinf_impl, + "Q->?": npyfuncs.np_int_isinf_impl, + # boolean + "?->?": npyfuncs.np_int_isinf_impl, + # datetime & timedelta + "m->?": npyfuncs.np_int_isinf_impl, + "M->?": 
npyfuncs.np_int_isinf_impl, + } + + ufunc_db[np.isfinite] = { + "f->?": npyfuncs.np_real_isfinite_impl, + "d->?": npyfuncs.np_real_isfinite_impl, + "F->?": npyfuncs.np_complex_isfinite_impl, + "D->?": npyfuncs.np_complex_isfinite_impl, + # int8 + "b->?": npyfuncs.np_int_isfinite_impl, + "B->?": npyfuncs.np_int_isfinite_impl, + # int16 + "h->?": npyfuncs.np_int_isfinite_impl, + "H->?": npyfuncs.np_int_isfinite_impl, + # int32 + "i->?": npyfuncs.np_int_isfinite_impl, + "I->?": npyfuncs.np_int_isfinite_impl, + # int64 + "l->?": npyfuncs.np_int_isfinite_impl, + "L->?": npyfuncs.np_int_isfinite_impl, + # intp + "q->?": npyfuncs.np_int_isfinite_impl, + "Q->?": npyfuncs.np_int_isfinite_impl, + # boolean + "?->?": npyfuncs.np_int_isfinite_impl, + # datetime & timedelta + "M->?": npyfuncs.np_datetime_isfinite_impl, + "m->?": npyfuncs.np_datetime_isfinite_impl, + } + + ufunc_db[np.signbit] = { + "f->?": npyfuncs.np_real_signbit_impl, + "d->?": npyfuncs.np_real_signbit_impl, + } + + ufunc_db[np.copysign] = { + "ff->f": npyfuncs.np_real_copysign_impl, + "dd->d": npyfuncs.np_real_copysign_impl, + } + + ufunc_db[np.nextafter] = { + "ff->f": npyfuncs.np_real_nextafter_impl, + "dd->d": npyfuncs.np_real_nextafter_impl, + } + + ufunc_db[np.spacing] = { + "f->f": npyfuncs.np_real_spacing_impl, + "d->d": npyfuncs.np_real_spacing_impl, + } + + ufunc_db[np.ldexp] = { + "fi->f": npyfuncs.np_real_ldexp_impl, + "fl->f": npyfuncs.np_real_ldexp_impl, + "di->d": npyfuncs.np_real_ldexp_impl, + "dl->d": npyfuncs.np_real_ldexp_impl, + } + if numpy_version >= (2, 0) and IS_WIN32: + ufunc_db[np.ldexp]["fq->f"] = ufunc_db[np.ldexp].pop("fl->f") + ufunc_db[np.ldexp]["dq->d"] = ufunc_db[np.ldexp].pop("dl->d") + + # bit twiddling functions + ufunc_db[np.bitwise_and] = { + "??->?": numbers.int_and_impl, + "bb->b": numbers.int_and_impl, + "BB->B": numbers.int_and_impl, + "hh->h": numbers.int_and_impl, + "HH->H": numbers.int_and_impl, + "ii->i": numbers.int_and_impl, + "II->I": numbers.int_and_impl, + "ll->l": numbers.int_and_impl, + "LL->L": numbers.int_and_impl, + "qq->q": numbers.int_and_impl, + "QQ->Q": numbers.int_and_impl, + } + + ufunc_db[np.bitwise_or] = { + "??->?": numbers.int_or_impl, + "bb->b": numbers.int_or_impl, + "BB->B": numbers.int_or_impl, + "hh->h": numbers.int_or_impl, + "HH->H": numbers.int_or_impl, + "ii->i": numbers.int_or_impl, + "II->I": numbers.int_or_impl, + "ll->l": numbers.int_or_impl, + "LL->L": numbers.int_or_impl, + "qq->q": numbers.int_or_impl, + "QQ->Q": numbers.int_or_impl, + } + + ufunc_db[np.bitwise_xor] = { + "??->?": numbers.int_xor_impl, + "bb->b": numbers.int_xor_impl, + "BB->B": numbers.int_xor_impl, + "hh->h": numbers.int_xor_impl, + "HH->H": numbers.int_xor_impl, + "ii->i": numbers.int_xor_impl, + "II->I": numbers.int_xor_impl, + "ll->l": numbers.int_xor_impl, + "LL->L": numbers.int_xor_impl, + "qq->q": numbers.int_xor_impl, + "QQ->Q": numbers.int_xor_impl, + } + + ufunc_db[np.invert] = { # aka np.bitwise_not + "?->?": numbers.int_invert_impl, + "b->b": numbers.int_invert_impl, + "B->B": numbers.int_invert_impl, + "h->h": numbers.int_invert_impl, + "H->H": numbers.int_invert_impl, + "i->i": numbers.int_invert_impl, + "I->I": numbers.int_invert_impl, + "l->l": numbers.int_invert_impl, + "L->L": numbers.int_invert_impl, + "q->q": numbers.int_invert_impl, + "Q->Q": numbers.int_invert_impl, + } + + ufunc_db[np.left_shift] = { + "bb->b": numbers.int_shl_impl, + "BB->B": numbers.int_shl_impl, + "hh->h": numbers.int_shl_impl, + "HH->H": numbers.int_shl_impl, + "ii->i": numbers.int_shl_impl, 
+        "II->I": numbers.int_shl_impl,
+        "ll->l": numbers.int_shl_impl,
+        "LL->L": numbers.int_shl_impl,
+        "qq->q": numbers.int_shl_impl,
+        "QQ->Q": numbers.int_shl_impl,
+    }
+
+    ufunc_db[np.right_shift] = {
+        "bb->b": numbers.int_shr_impl,
+        "BB->B": numbers.int_shr_impl,
+        "hh->h": numbers.int_shr_impl,
+        "HH->H": numbers.int_shr_impl,
+        "ii->i": numbers.int_shr_impl,
+        "II->I": numbers.int_shr_impl,
+        "ll->l": numbers.int_shr_impl,
+        "LL->L": numbers.int_shr_impl,
+        "qq->q": numbers.int_shr_impl,
+        "QQ->Q": numbers.int_shr_impl,
+    }
+
+    # Inject datetime64 support
+    from numba.np import npdatetime
+
+    ufunc_db[np.negative].update(
+        {
+            "m->m": npdatetime.timedelta_neg_impl,
+        }
+    )
+    ufunc_db[np.positive].update(
+        {
+            "m->m": npdatetime.timedelta_pos_impl,
+        }
+    )
+    ufunc_db[np.absolute].update(
+        {
+            "m->m": npdatetime.timedelta_abs_impl,
+        }
+    )
+    ufunc_db[np.sign].update(
+        {
+            "m->m": npdatetime.timedelta_sign_impl,
+        }
+    )
+    ufunc_db[np.add].update(
+        {
+            "mm->m": npdatetime.timedelta_add_impl,
+            "Mm->M": npdatetime.datetime_plus_timedelta,
+            "mM->M": npdatetime.timedelta_plus_datetime,
+        }
+    )
+    ufunc_db[np.subtract].update(
+        {
+            "mm->m": npdatetime.timedelta_sub_impl,
+            "Mm->M": npdatetime.datetime_minus_timedelta,
+            "MM->m": npdatetime.datetime_minus_datetime,
+        }
+    )
+    ufunc_db[np.multiply].update(
+        {
+            "mq->m": npdatetime.timedelta_times_number,
+            "md->m": npdatetime.timedelta_times_number,
+            "qm->m": npdatetime.number_times_timedelta,
+            "dm->m": npdatetime.number_times_timedelta,
+        }
+    )
+    if np.divide != np.true_divide:
+        ufunc_db[np.divide].update(
+            {
+                "mq->m": npdatetime.timedelta_over_number,
+                "md->m": npdatetime.timedelta_over_number,
+                "mm->d": npdatetime.timedelta_over_timedelta,
+            }
+        )
+    ufunc_db[np.true_divide].update(
+        {
+            "mq->m": npdatetime.timedelta_over_number,
+            "md->m": npdatetime.timedelta_over_number,
+            "mm->d": npdatetime.timedelta_over_timedelta,
+        }
+    )
+    ufunc_db[np.floor_divide].update(
+        {
+            "mq->m": npdatetime.timedelta_over_number,
+            "md->m": npdatetime.timedelta_over_number,
+        }
+    )
+
+    ufunc_db[np.floor_divide].update(
+        {
+            "mm->q": npdatetime.timedelta_floor_div_timedelta,
+        }
+    )
+
+    ufunc_db[np.equal].update(
+        {
+            "MM->?": npdatetime.datetime_eq_datetime_impl,
+            "mm->?": npdatetime.timedelta_eq_timedelta_impl,
+        }
+    )
+    ufunc_db[np.not_equal].update(
+        {
+            "MM->?": npdatetime.datetime_ne_datetime_impl,
+            "mm->?": npdatetime.timedelta_ne_timedelta_impl,
+        }
+    )
+    ufunc_db[np.less].update(
+        {
+            "MM->?": npdatetime.datetime_lt_datetime_impl,
+            "mm->?": npdatetime.timedelta_lt_timedelta_impl,
+        }
+    )
+    ufunc_db[np.less_equal].update(
+        {
+            "MM->?": npdatetime.datetime_le_datetime_impl,
+            "mm->?": npdatetime.timedelta_le_timedelta_impl,
+        }
+    )
+    ufunc_db[np.greater].update(
+        {
+            "MM->?": npdatetime.datetime_gt_datetime_impl,
+            "mm->?": npdatetime.timedelta_gt_timedelta_impl,
+        }
+    )
+    ufunc_db[np.greater_equal].update(
+        {
+            "MM->?": npdatetime.datetime_ge_datetime_impl,
+            "mm->?": npdatetime.timedelta_ge_timedelta_impl,
+        }
+    )
+    ufunc_db[np.maximum].update(
+        {
+            "MM->M": npdatetime.datetime_maximum_impl,
+            "mm->m": npdatetime.timedelta_maximum_impl,
+        }
+    )
+    ufunc_db[np.minimum].update(
+        {
+            "MM->M": npdatetime.datetime_minimum_impl,
+            "mm->m": npdatetime.timedelta_minimum_impl,
+        }
+    )
+    # there is no difference for datetime/timedelta in maximum/fmax
+    # and minimum/fmin
+    ufunc_db[np.fmax].update(
+        {
+            "MM->M": npdatetime.datetime_fmax_impl,
+            "mm->m": npdatetime.timedelta_fmax_impl,
+        }
+    )
+    ufunc_db[np.fmin].update(
+        {
+            "MM->M": npdatetime.datetime_fmin_impl,
+            "mm->m": npdatetime.timedelta_fmin_impl,
+        }
+    )
+
+    ufunc_db[np.remainder].update(
+        {
+            "mm->m": npdatetime.timedelta_mod_timedelta,
+        }
+    )
diff --git a/numba_cuda/numba/cuda/np/unsafe/__init__.py b/numba_cuda/numba/cuda/np/unsafe/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/numba_cuda/numba/cuda/np/unsafe/ndarray.py b/numba_cuda/numba/cuda/np/unsafe/ndarray.py
new file mode 100644
index 000000000..1ee871b00
--- /dev/null
+++ b/numba_cuda/numba/cuda/np/unsafe/ndarray.py
@@ -0,0 +1,83 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+This file provides internal compiler utilities that support certain special
+operations with numpy.
+"""
+
+from numba.core import types, typing
+from numba.cuda.cgutils import unpack_tuple
+from numba.cuda.extending import intrinsic
+from numba.core.imputils import impl_ret_new_ref
+from numba.core.errors import RequireLiteralValue, TypingError
+
+from numba.cpython.unsafe.tuple import tuple_setitem
+
+
+@intrinsic
+def empty_inferred(typingctx, shape):
+    """A version of numpy.empty whose dtype is inferred by the type system.
+
+    Expects `shape` to be an int-tuple.
+
+    There is special logic in the type-inferencer to handle the "refine"-ing
+    of undefined dtype.
+    """
+    from numba.cuda.np.arrayobj import _empty_nd_impl
+
+    def codegen(context, builder, signature, args):
+        # check that the return type is now defined
+        arrty = signature.return_type
+        assert arrty.is_precise()
+        shapes = unpack_tuple(builder, args[0])
+        # redirect implementation to np.empty
+        res = _empty_nd_impl(context, builder, arrty, shapes)
+        return impl_ret_new_ref(context, builder, arrty, res._getvalue())
+
+    # make function signature
+    nd = len(shape)
+    array_ty = types.Array(ndim=nd, layout="C", dtype=types.undefined)
+    sig = array_ty(shape)
+    return sig, codegen
+
+
+@intrinsic
+def to_fixed_tuple(typingctx, array, length):
+    """Convert *array* into a tuple of *length*
+
+    Returns ``UniTuple(array.dtype, length)``
+
+    ** Warning **
+    - No bounds checking.
+      If *length* is longer than *array.size*, the behavior is undefined.
+    """
+    if not isinstance(length, types.IntegerLiteral):
+        raise RequireLiteralValue("*length* argument must be a constant")
+
+    if array.ndim != 1:
+        raise TypingError("Not supported on array.ndim={}".format(array.ndim))
+
+    # Determine types
+    tuple_size = int(length.literal_value)
+    tuple_type = types.UniTuple(dtype=array.dtype, count=tuple_size)
+    sig = tuple_type(array, length)
+
+    def codegen(context, builder, signature, args):
+        def impl(array, length, empty_tuple):
+            out = empty_tuple
+            for i in range(length):
+                out = tuple_setitem(out, i, array[i])
+            return out
+
+        inner_argtypes = [signature.args[0], types.intp, tuple_type]
+        inner_sig = typing.signature(tuple_type, *inner_argtypes)
+        ll_idx_type = context.get_value_type(types.intp)
+        # Allocate an empty tuple
+        empty_tuple = context.get_constant_undef(tuple_type)
+        inner_args = [args[0], ll_idx_type(tuple_size), empty_tuple]
+
+        res = context.compile_internal(builder, impl, inner_sig, inner_args)
+        return res
+
+    return sig, codegen
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index e0e5ee87a..9eb865dbb 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -169,7 +169,7 @@ def load_additional_registries(self):
         from numba.cpython import rangeobj, enumimpl  # noqa: F401
         from numba.core import optional  # noqa: F401
         from numba.misc import cffiimpl
-        from numba.np import arrayobj  # noqa: F401
+        from numba.cuda.np import arrayobj  # noqa: F401
         from numba.np import npdatetime  # noqa: F401
         from . import (
             cudaimpl,
@@ -182,7 +182,7 @@ def load_additional_registries(self):
         )
 
         # fix for #8940
-        from numba.np.unsafe import ndarray  # noqa F401
+        from numba.cuda.np.unsafe import ndarray  # noqa F401
 
         self.install_registry(cudaimpl.registry)
         self.install_registry(cffiimpl.registry)
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py
index 8bc57b24d..c9f8de1cb 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py
@@ -8,7 +8,7 @@
 from numba import cuda, types, njit, typeof
 from numba.cuda import config
-from numba.np import numpy_support
+from numba.cuda.np import numpy_support
 from numba.cuda.tests.support import TestCase
 from numba.cuda.tests.support import MemoryLeakMixin
diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_import.py b/numba_cuda/numba/cuda/tests/nocuda/test_import.py
index 17e1c3074..054752d38 100644
--- a/numba_cuda/numba/cuda/tests/nocuda/test_import.py
+++ b/numba_cuda/numba/cuda/tests/nocuda/test_import.py
@@ -43,6 +43,7 @@ def test_no_impl_import(self):
             "numba.np.arraymath",
             "numba.np.npdatetime",
             "numba.np.npyimpl",
+            "numba.cuda.np.npyimpl",
             "numba.typed.typeddict",
             "numba.typed.typedlist",
         )
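Editor's note (not part of the patch): a minimal usage sketch for the `to_fixed_tuple` intrinsic added above, assuming it behaves on the CUDA target as its docstring and typing checks describe (the *length* argument must resolve to a compile-time integer literal and the input array must be 1-D). The kernel name, `TUPLE_LEN`, and the host-side calls are hypothetical illustrations, not code from this repository.

    import numpy as np
    from numba import cuda
    from numba.cuda.np.unsafe.ndarray import to_fixed_tuple

    TUPLE_LEN = 3  # must be a compile-time constant (IntegerLiteral)

    @cuda.jit
    def sum_first_three(arr, out):
        # Freeze the first TUPLE_LEN elements of a 1-D device array into a
        # UniTuple(arr.dtype, TUPLE_LEN); no bounds checking is performed.
        fixed = to_fixed_tuple(arr, TUPLE_LEN)
        out[0] = fixed[0] + fixed[1] + fixed[2]

    # Host-side sketch:
    #   d_arr = cuda.to_device(np.arange(5.0))
    #   d_out = cuda.device_array(1)
    #   sum_first_three[1, 1](d_arr, d_out)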