From 1ae469cfcde904bf2c85c430f17db538b1fd192c Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Mon, 4 Aug 2025 08:55:21 -0700 Subject: [PATCH 01/62] [Refactor][NFC] Vendor in _DispatcherBase for CUDA-specific changes --- numba_cuda/numba/cuda/dispatcher.py | 608 +++++++++++++++++++++++++++- 1 file changed, 602 insertions(+), 6 deletions(-) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 9f69955b7..68706bfd1 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -8,16 +8,16 @@ import weakref import uuid -from numba.core import compiler, types, typing, config +from numba.core import compiler, types, typing, config, errors, entrypoints from numba.cuda import serialize, utils from numba.cuda.core.caching import Cache, CacheImpl, NullCache from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import _DispatcherBase -from numba.core.errors import NumbaPerformanceWarning, TypingError +from numba.core.typeconv.rules import default_type_manager from numba.cuda.typing.templates import fold_arguments from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg +from numba.core.bytecode import get_code_object from numba.cuda.compiler import ( compile_cuda, CUDACompiler, @@ -691,7 +691,7 @@ def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem): f"Grid size {grid_size} will likely result in GPU " "under-utilization due to low occupancy." ) - warn(NumbaPerformanceWarning(msg)) + warn(errors.NumbaPerformanceWarning(msg)) def __call__(self, *args): return self.dispatcher.call( @@ -734,6 +734,602 @@ def load_overload(self, sig, target_context): return super().load_overload(sig, target_context) +class OmittedArg(object): + """ + A placeholder for omitted arguments with a default value. + """ + + def __init__(self, value): + self.value = value + + def __repr__(self): + return "omitted arg(%r)" % (self.value,) + + @property + def _numba_type_(self): + return types.Omitted(self.value) + + +class CompilingCounter(object): + """ + A simple counter that increment in __enter__ and decrement in __exit__. + """ + + def __init__(self): + self.counter = 0 + + def __enter__(self): + assert self.counter >= 0 + self.counter += 1 + + def __exit__(self, *args, **kwargs): + self.counter -= 1 + assert self.counter >= 0 + + def __bool__(self): + return self.counter > 0 + + __nonzero__ = __bool__ + + +class _DispatcherBase(_dispatcher.Dispatcher): + """ + Common base class for dispatcher Implementations. 
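+
+ Subclasses keep the Python-side dispatch state: the ``overloads`` mapping
+ of compiled signatures to compile results, argument typing via
+ ``typeof_pyval``, and the ``_compile_for_args`` hook used when a call
+ arrives for which no specialization exists yet. Fast overload selection
+ itself is delegated to the C-level ``_dispatcher.Dispatcher`` base.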
+ """ + + __numba__ = "py_func" + + def __init__( + self, arg_count, py_func, pysig, can_fallback, exact_match_required + ): + self._tm = default_type_manager + + # A mapping of signatures to compile results + self.overloads = collections.OrderedDict() + + self.py_func = py_func + # other parts of Numba assume the old Python 2 name for code object + self.func_code = get_code_object(py_func) + # but newer python uses a different name + self.__code__ = self.func_code + # a place to keep an active reference to the types of the active call + self._types_active_call = set() + # Default argument values match the py_func + self.__defaults__ = py_func.__defaults__ + + argnames = tuple(pysig.parameters) + default_values = self.py_func.__defaults__ or () + defargs = tuple(OmittedArg(val) for val in default_values) + try: + lastarg = list(pysig.parameters.values())[-1] + except IndexError: + has_stararg = False + else: + has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL + _dispatcher.Dispatcher.__init__( + self, + self._tm.get_pointer(), + arg_count, + self._fold_args, + argnames, + defargs, + can_fallback, + has_stararg, + exact_match_required, + ) + + self.doc = py_func.__doc__ + self._compiling_counter = CompilingCounter() + self._enable_sysmon = bool(config.ENABLE_SYS_MONITORING) + weakref.finalize(self, self._make_finalizer()) + + def _compilation_chain_init_hook(self): + """ + This will be called ahead of any part of compilation taking place (this + even includes being ahead of working out the types of the arguments). + This permits activities such as initialising extension entry points so + that the compiler knows about additional externally defined types etc + before it does anything. + """ + entrypoints.init_all() + + def _reset_overloads(self): + self._clear() + self.overloads.clear() + + def _make_finalizer(self): + """ + Return a finalizer function that will release references to + related compiled functions. + """ + overloads = self.overloads + targetctx = self.targetctx + + # Early-bind utils.shutting_down() into the function's local namespace + # (see issue #689) + def finalizer(shutting_down=utils.shutting_down): + # The finalizer may crash at shutdown, skip it (resources + # will be cleared by the process exiting, anyway). + if shutting_down(): + return + # This function must *not* hold any reference to self: + # we take care to bind the necessary objects in the closure. + for cres in overloads.values(): + try: + targetctx.remove_user_function(cres.entry_point) + except KeyError: + pass + + return finalizer + + @property + def signatures(self): + """ + Returns a list of compiled function signatures. + """ + return list(self.overloads) + + @property + def nopython_signatures(self): + return [ + cres.signature + for cres in self.overloads.values() + if not cres.objectmode + ] + + def disable_compile(self, val=True): + """Disable the compilation of new signatures at call time.""" + # If disabling compilation then there must be at least one signature + assert (not val) or len(self.signatures) > 0 + self._can_compile = not val + + def add_overload(self, cres): + args = tuple(cres.signature.args) + sig = [a._code for a in args] + self._insert(sig, cres.entry_point, cres.objectmode) + self.overloads[args] = cres + + def fold_argument_types(self, args, kws): + return self._compiler.fold_argument_types(args, kws) + + def get_call_template(self, args, kws): + """ + Get a typing.ConcreteTemplate for this dispatcher and the given + *args* and *kws* types. 
This allows to resolve the return type. + + A (template, pysig, args, kws) tuple is returned. + """ + # XXX how about a dispatcher template class automating the + # following? + + # Fold keyword arguments and resolve default values + pysig, args = self._compiler.fold_argument_types(args, kws) + kws = {} + # Ensure an overload is available + if self._can_compile: + self.compile(tuple(args)) + + # Create function type for typing + func_name = self.py_func.__name__ + name = "CallTemplate({0})".format(func_name) + # The `key` isn't really used except for diagnosis here, + # so avoid keeping a reference to `cfunc`. + call_template = typing.make_concrete_template( + name, key=func_name, signatures=self.nopython_signatures + ) + return call_template, pysig, args, kws + + def get_overload(self, sig): + """ + Return the compiled function for the given signature. + """ + args, return_type = sigutils.normalize_signature(sig) + return self.overloads[tuple(args)].entry_point + + @property + def is_compiling(self): + """ + Whether a specialization is currently being compiled. + """ + return self._compiling_counter + + def _compile_for_args(self, *args, **kws): + """ + For internal use. Compile a specialized version of the function + for the given *args* and *kws*, and return the resulting callable. + """ + assert not kws + # call any initialisation required for the compilation chain (e.g. + # extension point registration). + self._compilation_chain_init_hook() + + def error_rewrite(e, issue_type): + """ + Rewrite and raise Exception `e` with help supplied based on the + specified issue_type. + """ + if config.SHOW_HELP: + help_msg = errors.error_extras[issue_type] + e.patch_message("\n".join((str(e).rstrip(), help_msg))) + if config.FULL_TRACEBACKS: + raise e + else: + raise e.with_traceback(None) + + argtypes = [] + for a in args: + if isinstance(a, OmittedArg): + argtypes.append(types.Omitted(a.value)) + else: + argtypes.append(self.typeof_pyval(a)) + + return_val = None + try: + return_val = self.compile(tuple(argtypes)) + except errors.ForceLiteralArg as e: + # Received request for compiler re-entry with the list of arguments + # indicated by e.requested_args. + # First, check if any of these args are already Literal-ized + already_lit_pos = [ + i + for i in e.requested_args + if isinstance(args[i], types.Literal) + ] + if already_lit_pos: + # Abort compilation if any argument is already a Literal. + # Letting this continue will cause infinite compilation loop. + m = ( + "Repeated literal typing request.\n" + "{}.\n" + "This is likely caused by an error in typing. " + "Please see nested and suppressed exceptions." + ) + info = ", ".join( + "Arg #{} is {}".format(i, args[i]) + for i in sorted(already_lit_pos) + ) + raise errors.CompilerError(m.format(info)) + # Convert requested arguments into a Literal. 
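+ # Only the positions named in e.requested_args are wrapped with
+ # types.literal; every other argument is passed through unchanged via the
+ # identity lambda, and the retry below recompiles with those arguments
+ # specialized to their literal values.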
+ args = [ + (types.literal if i in e.requested_args else lambda x: x)( + args[i] + ) + for i, v in enumerate(args) + ] + # Re-enter compilation with the Literal-ized arguments + return_val = self._compile_for_args(*args) + + except errors.TypingError as e: + # Intercept typing error that may be due to an argument + # that failed inferencing as a Numba type + failed_args = [] + for i, arg in enumerate(args): + val = arg.value if isinstance(arg, OmittedArg) else arg + try: + tp = typeof(val, Purpose.argument) + except ValueError as typeof_exc: + failed_args.append((i, str(typeof_exc))) + else: + if tp is None: + failed_args.append( + (i, f"cannot determine Numba type of value {val}") + ) + if failed_args: + # Patch error message to ease debugging + args_str = "\n".join( + f"- argument {i}: {err}" for i, err in failed_args + ) + msg = ( + f"{str(e).rstrip()} \n\nThis error may have been caused " + f"by the following argument(s):\n{args_str}\n" + ) + e.patch_message(msg) + + error_rewrite(e, "typing") + except errors.UnsupportedError as e: + # Something unsupported is present in the user code, add help info + error_rewrite(e, "unsupported_error") + except ( + errors.NotDefinedError, + errors.RedefinedError, + errors.VerificationError, + ) as e: + # These errors are probably from an issue with either the code + # supplied being syntactically or otherwise invalid + error_rewrite(e, "interpreter") + except errors.ConstantInferenceError as e: + # this is from trying to infer something as constant when it isn't + # or isn't supported as a constant + error_rewrite(e, "constant_inference") + except Exception as e: + if config.SHOW_HELP: + if hasattr(e, "patch_message"): + help_msg = errors.error_extras["reportable"] + e.patch_message("\n".join((str(e).rstrip(), help_msg))) + # ignore the FULL_TRACEBACKS config, this needs reporting! + raise e + finally: + self._types_active_call.clear() + return return_val + + def inspect_llvm(self, signature=None): + """Get the LLVM intermediate representation generated by compilation. + + Parameters + ---------- + signature : tuple of numba types, optional + Specify a signature for which to obtain the LLVM IR. If None, the + IR is returned for all available signatures. + + Returns + ------- + llvm : dict[signature, str] or str + Either the LLVM IR string for the specified signature, or, if no + signature was given, a dictionary mapping signatures to LLVM IR + strings. + """ + if signature is not None: + lib = self.overloads[signature].library + return lib.get_llvm_str() + + return dict((sig, self.inspect_llvm(sig)) for sig in self.signatures) + + def inspect_asm(self, signature=None): + """Get the generated assembly code. + + Parameters + ---------- + signature : tuple of numba types, optional + Specify a signature for which to obtain the assembly code. If + None, the assembly code is returned for all available signatures. + + Returns + ------- + asm : dict[signature, str] or str + Either the assembly code for the specified signature, or, if no + signature was given, a dictionary mapping signatures to assembly + code. + """ + if signature is not None: + lib = self.overloads[signature].library + return lib.get_asm_str() + + return dict((sig, self.inspect_asm(sig)) for sig in self.signatures) + + def inspect_types( + self, file=None, signature=None, pretty=False, style="default", **kwargs + ): + """Print/return Numba intermediate representation (IR)-annotated code. + + Parameters + ---------- + file : file-like object, optional + File to which to print. 
Defaults to sys.stdout if None. Must be + None if ``pretty=True``. + signature : tuple of numba types, optional + Print/return the intermediate representation for only the given + signature. If None, the IR is printed for all available signatures. + pretty : bool, optional + If True, an Annotate object will be returned that can render the + IR with color highlighting in Jupyter and IPython. ``file`` must + be None if ``pretty`` is True. Additionally, the ``pygments`` + library must be installed for ``pretty=True``. + style : str, optional + Choose a style for rendering. Ignored if ``pretty`` is ``False``. + This is directly consumed by ``pygments`` formatters. To see a + list of available styles, import ``pygments`` and run + ``list(pygments.styles.get_all_styles())``. + + Returns + ------- + annotated : Annotate object, optional + Only returned if ``pretty=True``, otherwise this function is only + used for its printing side effect. If ``pretty=True``, an Annotate + object is returned that can render itself in Jupyter and IPython. + """ + overloads = self.overloads + if signature is not None: + overloads = {signature: self.overloads[signature]} + + if not pretty: + if file is None: + file = sys.stdout + + for ver, res in overloads.items(): + print("%s %s" % (self.py_func.__name__, ver), file=file) + print("-" * 80, file=file) + print(res.type_annotation, file=file) + print("=" * 80, file=file) + else: + if file is not None: + raise ValueError("`file` must be None if `pretty=True`") + from numba.core.annotations.pretty_annotate import Annotate + + return Annotate(self, signature=signature, style=style) + + def inspect_cfg(self, signature=None, show_wrapper=None, **kwargs): + """ + For inspecting the CFG of the function. + + By default the CFG of the user function is shown. The *show_wrapper* + option can be set to "python" or "cfunc" to show the python wrapper + function or the *cfunc* wrapper function, respectively. + + Parameters accepted in kwargs + ----------------------------- + filename : string, optional + the name of the output file, if given this will write the output to + filename + view : bool, optional + whether to immediately view the optional output file + highlight : bool, set, dict, optional + what, if anything, to highlight, options are: + { incref : bool, # highlight NRT_incref calls + decref : bool, # highlight NRT_decref calls + returns : bool, # highlight exits which are normal returns + raises : bool, # highlight exits which are from raise + meminfo : bool, # highlight calls to NRT*meminfo + branches : bool, # highlight true/false branches + } + Default is True which sets all of the above to True. Supplying a set + of strings is also accepted, these are interpreted as key:True with + respect to the above dictionary. e.g. {'incref', 'decref'} would + switch on highlighting on increfs and decrefs. + interleave: bool, set, dict, optional + what, if anything, to interleave in the LLVM IR, options are: + { python: bool # interleave python source code with the LLVM IR + lineinfo: bool # interleave line information markers with the LLVM + # IR + } + Default is True which sets all of the above to True. Supplying a set + of strings is also accepted, these are interpreted as key:True with + respect to the above dictionary. e.g. {'python',} would + switch on interleaving of python source code in the LLVM IR. + strip_ir : bool, optional + Default is False. If set to True all LLVM IR that is superfluous to + that requested in kwarg `highlight` will be removed. 
+ show_key : bool, optional + Default is True. Create a "key" for the highlighting in the rendered + CFG. + fontsize : int, optional + Default is 8. Set the fontsize in the output to this value. + """ + if signature is not None: + cres = self.overloads[signature] + lib = cres.library + if show_wrapper == "python": + fname = cres.fndesc.llvm_cpython_wrapper_name + elif show_wrapper == "cfunc": + fname = cres.fndesc.llvm_cfunc_wrapper_name + else: + fname = cres.fndesc.mangled_name + return lib.get_function_cfg(fname, py_func=self.py_func, **kwargs) + + return dict( + (sig, self.inspect_cfg(sig, show_wrapper=show_wrapper)) + for sig in self.signatures + ) + + def inspect_disasm_cfg(self, signature=None): + """ + For inspecting the CFG of the disassembly of the function. + + Requires python package: r2pipe + Requires radare2 binary on $PATH. + Notebook rendering requires python package: graphviz + + signature : tuple of Numba types, optional + Print/return the disassembly CFG for only the given signatures. + If None, the IR is printed for all available signatures. + """ + if signature is not None: + cres = self.overloads[signature] + lib = cres.library + return lib.get_disasm_cfg(cres.fndesc.mangled_name) + + return dict( + (sig, self.inspect_disasm_cfg(sig)) for sig in self.signatures + ) + + def get_annotation_info(self, signature=None): + """ + Gets the annotation information for the function specified by + signature. If no signature is supplied a dictionary of signature to + annotation information is returned. + """ + signatures = self.signatures if signature is None else [signature] + out = collections.OrderedDict() + for sig in signatures: + cres = self.overloads[sig] + ta = cres.type_annotation + key = ( + ta.func_id.filename + ":" + str(ta.func_id.firstlineno + 1), + ta.signature, + ) + out[key] = ta.annotate_raw()[key] + return out + + def _explain_ambiguous(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + """ + assert not kws, "kwargs not handled" + args = tuple([self.typeof_pyval(a) for a in args]) + # The order here must be deterministic for testing purposes, which + # is ensured by the OrderedDict. + sigs = self.nopython_signatures + # This will raise + self.typingctx.resolve_overload( + self.py_func, sigs, args, kws, allow_ambiguous=False + ) + + def _explain_matching_error(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + msg = "No matching definition for argument type(s) %s" % ", ".join( + map(str, args) + ) + raise TypeError(msg) + + def _search_new_conversions(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + Search for approximately matching signatures for the given arguments, + and ensure the corresponding conversions are registered in the C++ + type manager. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + found = False + for sig in self.nopython_signatures: + conv = self.typingctx.install_possible_conversions(args, sig.args) + if conv: + found = True + return found + + def __repr__(self): + return "%s(%s)" % (type(self).__name__, self.py_func) + + def typeof_pyval(self, val): + """ + Resolve the Numba type of Python value *val*. + This is called from numba._dispatcher as a fallback if the native code + cannot decide the type. 
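+
+ For example (illustrative only, assuming a dispatcher instance ``disp``)::
+
+ disp.typeof_pyval(1.0)       # -> types.float64
+ disp.typeof_pyval(object())  # -> types.pyobject (fallback)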
+ """ + try: + tp = typeof(val, Purpose.argument) + except ValueError: + tp = types.pyobject + else: + if tp is None: + tp = types.pyobject + self._types_active_call.add(tp) + return tp + + def _callback_add_timer(self, duration, cres, lock_name): + md = cres.metadata + # md can be None when code is loaded from cache + if md is not None: + timers = md.setdefault("timers", {}) + if lock_name not in timers: + # Only write if the metadata does not exist + timers[lock_name] = duration + else: + msg = f"'{lock_name} metadata is already defined." + raise AssertionError(msg) + + def _callback_add_compiler_timer(self, duration, cres): + return self._callback_add_timer( + duration, cres, lock_name="compiler_lock" + ) + + def _callback_add_llvm_timer(self, duration, cres): + return self._callback_add_timer(duration, cres, lock_name="llvm_lock") + + class _MemoMixin: __uuid = None # A {uuid -> instance} mapping, for deserialization @@ -824,7 +1420,7 @@ def _compile_cached(self, args, return_type): try: retval = self._compile_core(args, return_type) - except TypingError as e: + except errors.TypingError as e: self._failed_cache[key] = e return False, e else: @@ -1312,7 +1908,7 @@ def get_compile_result(self, sig): self.compile(atypes) else: msg = f"{sig} not available and compilation disabled" - raise TypingError(msg) + raise errors.TypingError(msg) return self.overloads[atypes] def recompile(self): From bde712847bb9bf974779fda06f16a9bdc49f556f Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Mon, 4 Aug 2025 21:27:13 -0700 Subject: [PATCH 02/62] [Refactor][WIP] Vendoring in dispatcher cext for CUDA-specific changes --- numba_cuda/numba/cuda/__init__.py | 1 + .../numba/cuda/_dispatcher/_devicearray.h | 25 + .../numba/cuda/_dispatcher/_dispatcher.cpp | 1667 +++++++++++++++++ .../numba/cuda/_dispatcher/_hashtable.cpp | 529 ++++++ .../numba/cuda/_dispatcher/_hashtable.h | 132 ++ .../numba/cuda/_dispatcher/_numba_common.h | 43 + numba_cuda/numba/cuda/_dispatcher/_pymodule.h | 35 + numba_cuda/numba/cuda/_dispatcher/_typeof.cpp | 1170 ++++++++++++ numba_cuda/numba/cuda/_dispatcher/_typeof.h | 16 + .../numba/cuda/_dispatcher/typeconv.cpp | 209 +++ .../numba/cuda/_dispatcher/typeconv.hpp | 98 + numba_cuda/numba/cuda/dispatcher.py | 2 +- pyproject.toml | 1 + setup.py | 141 +- 14 files changed, 4063 insertions(+), 6 deletions(-) create mode 100644 numba_cuda/numba/cuda/_dispatcher/_devicearray.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/_hashtable.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_numba_common.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_pymodule.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_typeof.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/_typeof.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/typeconv.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/typeconv.hpp diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 0465e1903..607ef2e09 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -1,5 +1,6 @@ import importlib from numba.core import config +from numba.core.config import ENABLE_SYS_MONITORING from .utils import _readenv import warnings diff --git a/numba_cuda/numba/cuda/_dispatcher/_devicearray.h b/numba_cuda/numba/cuda/_dispatcher/_devicearray.h new file mode 100644 index 000000000..5b276eacf --- /dev/null +++ 
b/numba_cuda/numba/cuda/_dispatcher/_devicearray.h @@ -0,0 +1,25 @@ +#ifndef NUMBA_DEVICEARRAY_H_ +#define NUMBA_DEVICEARRAY_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +/* These definitions should only be used by consumers of the Device Array API. + * Consumers access the API through the opaque pointer stored in + * _devicearray._DEVICEARRAY_API. We don't want these definitions in + * _devicearray.cpp itself because they would conflict with the actual + * implementations there. + */ +#ifndef NUMBA_IN_DEVICEARRAY_CPP_ + + extern void **DeviceArray_API; + #define DeviceArrayType (*(PyTypeObject*)DeviceArray_API[0]) + +#endif /* ndef NUMBA_IN_DEVICEARRAY_CPP */ + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_DEVICEARRAY_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp new file mode 100644 index 000000000..5afdb2880 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp @@ -0,0 +1,1667 @@ +#include "_pymodule.h" + +#include +#include +#include +#include + +#include "_typeof.h" +#include "frameobject.h" +#include "traceback.h" +#include "typeconv.hpp" +#include "_devicearray.h" + +/* + * Notes on the C_TRACE macro: + * + * The original C_TRACE macro (from ceval.c) would call + * PyTrace_C_CALL et al., for which the frame argument wouldn't + * be usable. Since we explicitly synthesize a frame using the + * original Python code object, we call PyTrace_CALL instead so + * the profiler can report the correct source location. + * + * Likewise, while ceval.c would call PyTrace_C_EXCEPTION in case + * of error, the profiler would simply expect a RETURN in case of + * a Python function, so we generate that here (making sure the + * exception state is preserved correctly). 
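+ *
+ * Note that on Python 3.12+ this C_TRACE machinery is not used at all:
+ * the call_cfunc for those versions reports the equivalent PY_START /
+ * PY_RETURN / RAISE / PY_UNWIND events through sys.monitoring instead
+ * (see invoke_monitoring further down).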
+ * + */ + +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +#ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 +#endif +#include "internal/pycore_frame.h" +// This is a fix suggested in the comments in https://github.com/python/cpython/issues/108216 +// specifically https://github.com/python/cpython/issues/108216#issuecomment-1696565797 +#ifdef HAVE_STD_ATOMIC +# undef HAVE_STD_ATOMIC +#endif +#undef _PyGC_FINALIZED + +#if (PY_MINOR_VERSION == 12) + #include "internal/pycore_atomic.h" +#endif +#include "internal/pycore_interp.h" +#include "internal/pycore_pyerrors.h" +#include "internal/pycore_instruments.h" +#include "internal/pycore_call.h" +#include "cpython/code.h" + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) +#ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 +#endif +#include "internal/pycore_frame.h" +#include "internal/pycore_pyerrors.h" + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L6804 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) { + return 0; + } + if (frame == NULL) { + return -1; + } + int old_what = tstate->tracing_what; + tstate->tracing_what = what; + PyThreadState_EnterTracing(tstate); + result = func(obj, frame, what, NULL); + PyThreadState_LeaveTracing(tstate); + tstate->tracing_what = old_what; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + _PyErr_Fetch(tstate, &type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, what, arg); + if (err == 0) + { + _PyErr_Restore(tstate, type, value, traceback); + return 0; + } + else { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L7245 + * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call, frame) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_CALL, cfunc)) { \ + x = NULL; \ +} \ +else { \ + x = call; \ + if (tstate->c_profilefunc != NULL) { \ + if (x == NULL) { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } else { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc)) { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} \ + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10 || PY_MINOR_VERSION == 11) + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L36-L40 + */ +typedef struct { + PyCodeObject *code; // The code object for the bounds. May be NULL. + PyCodeAddressRange bounds; // Only valid if code != NULL. 
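+ // Shadow CFrame whose use_tracing/previous fields are wired to the
+ // thread state's current cframe in the Python 3.10 call_cfunc below.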
+ CFrame cframe; +} PyTraceInfo; + + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1257-L1266 + * NOTE: The function is renamed. + */ +static void +_nb_PyLineTable_InitAddressRange(const char *linetable, Py_ssize_t length, int firstlineno, PyCodeAddressRange *range) +{ + range->opaque.lo_next = linetable; + range->opaque.limit = range->opaque.lo_next + length; + range->ar_start = -1; + range->ar_end = 0; + range->opaque.computed_line = firstlineno; + range->ar_line = -1; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1269-L1275 + * NOTE: The function is renamed. + */ +static int +_nb_PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds) +{ + const char *linetable = PyBytes_AS_STRING(co->co_linetable); + Py_ssize_t length = PyBytes_GET_SIZE(co->co_linetable); + _nb_PyLineTable_InitAddressRange(linetable, length, co->co_firstlineno, bounds); + return bounds->ar_line; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5468-L5475 + * NOTE: The call to _PyCode_InitAddressRange is renamed. + */ +static void +initialize_trace_info(PyTraceInfo *trace_info, PyFrameObject *frame) +{ + if (trace_info->code != frame->f_code) { + trace_info->code = frame->f_code; + _nb_PyCode_InitAddressRange(frame->f_code, &trace_info->bounds); + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5477-L5501 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->cframe->use_tracing = 0; + if (frame->f_lasti < 0) { + frame->f_lineno = frame->f_code->co_firstlineno; + } + else { + initialize_trace_info(trace_info, frame); + frame->f_lineno = _PyCode_CheckLineNumber(frame->f_lasti*sizeof(_Py_CODEUNIT), &trace_info->bounds); + } + result = func(obj, frame, what, arg); + frame->f_lineno = 0; + tstate->cframe->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5445-L5466 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, trace_info, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5810-L5839 + * NOTE: The state test https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5811 + * has been removed, it's dealt with in call_cfunc. 
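+ * NOTE: Unlike the 3.11-specific variant above, this macro forwards the
+ * locally-built PyTraceInfo (see call_cfunc) to call_trace; the 3.11
+ * variant instead receives the synthesized frame directly.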
+ */ +#define C_TRACE(x, call) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, tstate->frame, &trace_info, PyTrace_CALL,\ + cfunc)) \ + x = NULL; \ +else \ +{ \ + x = call; \ + if (tstate->c_profilefunc != NULL) \ + { \ + if (x == NULL) \ + { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + &trace_info, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } \ + else \ + { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + &trace_info, \ + PyTrace_RETURN, cfunc)) \ + { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} + +#else // Python <3.10 + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4242-L4257 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->use_tracing = 0; + result = func(obj, frame, what, arg); + tstate->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4520-L4549 + * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, tstate->frame, PyTrace_CALL, cfunc)) \ + x = NULL; \ +else \ +{ \ + x = call; \ + if (tstate->c_profilefunc != NULL) \ + { \ + if (x == NULL) \ + { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } \ + else \ + { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + PyTrace_RETURN, cfunc)) \ + { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} + + +#endif + +typedef std::vector TypeTable; +typedef std::vector Functions; + +/* The Dispatcher class is the base class of all dispatchers in the CPU and + CUDA targets. Its main responsibilities are: + + - Resolving the best overload to call for a given set of arguments, and + - Calling the resolved overload. + + This logic is implemented within this class for efficiency (lookup of the + appropriate overload needs to be fast) and ease of implementation (calling + directly into a compiled function using a function pointer is easier within + the C++ code where the overload has been resolved). 
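+
+ Overload storage is deliberately flat: `overloads` holds argct type codes
+ per overload, laid out back to back, with the matching callable at the
+ same index in `functions`. resolve() hands this flat array to the
+ TypeManager, which selects the best (or exact, if required) match for the
+ argument types of a call.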
*/ +class Dispatcher { +public: + PyObject_HEAD + /* Whether compilation of new overloads is permitted */ + char can_compile; + /* Enable sys.monitoring (since Python 3.12+) */ + char enable_sysmon; + /* Whether fallback to object mode is permitted */ + char can_fallback; + /* Whether types must match exactly when resolving overloads. + If not, conversions (e.g. float32 -> float64) are permitted when + searching for a match. */ + char exact_match_required; + /* Borrowed reference */ + PyObject *fallbackdef; + /* Whether to fold named arguments and default values + (false for lifted loops) */ + int fold_args; + /* Whether the last positional argument is a stararg */ + int has_stararg; + /* Tuple of argument names */ + PyObject *argnames; + /* Tuple of default values */ + PyObject *defargs; + /* Number of arguments to function */ + int argct; + /* Used for selecting overloaded function implementations */ + TypeManager *tm; + /* An array of overloads */ + Functions functions; + /* A flattened array of argument types to all overloads + * (invariant: sizeof(overloads) == argct * sizeof(functions)) */ + TypeTable overloads; + + /* Add a new overload. Parameters: + + - args: An array of Type objects, one for each parameter + - callable: The callable implementing this overload. */ + void addDefinition(Type args[], PyObject *callable) { + overloads.reserve(argct + overloads.size()); + for (int i=0; iselectOverload(sig, &overloads[0], selected, argct, + ovct, allow_unsafe, + exact_match_required); + } + if (matches == 1) { + return functions[selected]; + } + return NULL; + } + + /* Remove all overloads */ + void clear() { + functions.clear(); + overloads.clear(); + } + +}; + + +static int +Dispatcher_traverse(Dispatcher *self, visitproc visit, void *arg) +{ + Py_VISIT(self->defargs); + return 0; +} + +static void +Dispatcher_dealloc(Dispatcher *self) +{ + Py_XDECREF(self->argnames); + Py_XDECREF(self->defargs); + self->clear(); + Py_TYPE(self)->tp_free((PyObject*)self); +} + + +static int +Dispatcher_init(Dispatcher *self, PyObject *args, PyObject *kwds) +{ + PyObject *tmaddrobj; + void *tmaddr; + int argct; + int can_fallback; + int has_stararg = 0; + int exact_match_required = 0; + + if (!PyArg_ParseTuple(args, "OiiO!O!i|ii", &tmaddrobj, &argct, + &self->fold_args, + &PyTuple_Type, &self->argnames, + &PyTuple_Type, &self->defargs, + &can_fallback, + &has_stararg, + &exact_match_required + )) { + return -1; + } + Py_INCREF(self->argnames); + Py_INCREF(self->defargs); + tmaddr = PyLong_AsVoidPtr(tmaddrobj); + self->tm = static_cast(tmaddr); + self->argct = argct; + self->can_compile = 1; + self->enable_sysmon = 0; // default to turn off sys.monitoring + self->can_fallback = can_fallback; + self->fallbackdef = NULL; + self->has_stararg = has_stararg; + self->exact_match_required = exact_match_required; + return 0; +} + +static PyObject * +Dispatcher_clear(Dispatcher *self, PyObject *args) +{ + self->clear(); + Py_RETURN_NONE; +} + +static +PyObject* +Dispatcher_Insert(Dispatcher *self, PyObject *args, PyObject *kwds) +{ + /* The cuda kwarg is a temporary addition until CUDA overloads are compiled + * functions. Once they are compiled functions, kwargs can be removed from + * this function. 
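+ * When cuda is true, the PyCFunction type check below is skipped, because
+ * CUDA overloads are _Kernel instances rather than builtin compiled
+ * functions.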
*/ + static char *keywords[] = { + (char*)"sig", + (char*)"func", + (char*)"objectmode", + (char*)"cuda", + NULL + }; + + PyObject *sigtup, *cfunc; + int i, sigsz; + int *sig; + int objectmode = 0; + int cuda = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|ip", keywords, &sigtup, + &cfunc, &objectmode, &cuda)) { + return NULL; + } + + if (!cuda && !PyObject_TypeCheck(cfunc, &PyCFunction_Type) ) { + PyErr_SetString(PyExc_TypeError, "must be builtin_function_or_method"); + return NULL; + } + + sigsz = PySequence_Fast_GET_SIZE(sigtup); + sig = new int[sigsz]; + + for (i = 0; i < sigsz; ++i) { + sig[i] = PyLong_AsLong(PySequence_Fast_GET_ITEM(sigtup, i)); + } + + /* The reference to cfunc is borrowed; this only works because the + derived Python class also stores an (owned) reference to cfunc. */ + self->addDefinition(sig, cfunc); + + /* Add pure python fallback */ + if (!self->fallbackdef && objectmode){ + self->fallbackdef = cfunc; + } + + delete[] sig; + + Py_RETURN_NONE; +} + +static +void explain_issue(PyObject *dispatcher, PyObject *args, PyObject *kws, + const char *method_name, const char *default_msg) +{ + PyObject *callback, *result; + callback = PyObject_GetAttrString(dispatcher, method_name); + if (!callback) { + PyErr_SetString(PyExc_TypeError, default_msg); + return; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result != NULL) { + PyErr_Format(PyExc_RuntimeError, "%s must raise an exception", + method_name); + Py_DECREF(result); + } +} + +static +void explain_ambiguous(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_ambiguous", + "Ambiguous overloading"); +} + +static +void explain_matching_error(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_matching_error", + "No matching definition"); +} + +static +int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + PyObject *callback, *result; + int res; + + callback = PyObject_GetAttrString(dispatcher, + "_search_new_conversions"); + if (!callback) { + return -1; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result == NULL) { + return -1; + } + if (!PyBool_Check(result)) { + Py_DECREF(result); + PyErr_SetString(PyExc_TypeError, + "_search_new_conversions() should return a boolean"); + return -1; + } + res = (result == Py_True) ? 1 : 0; + Py_DECREF(result); + return res; +} + + +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11)) + +/* A custom, fast, inlinable version of PyCFunction_Call() */ +static PyObject * +call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) +{ + PyCFunctionWithKeywords fn; + PyThreadState *tstate; + + assert(PyCFunction_Check(cfunc)); + assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); + fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); + tstate = PyThreadState_GET(); + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) + /* + * On Python 3.11, _PyEval_EvalFrameDefault stops using PyTraceInfo since + * it's now baked into ThreadState. 
+ * https://github.com/python/cpython/pull/26623 + */ + if (tstate->cframe->use_tracing && tstate->c_profilefunc) +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10) + /* + * On Python 3.10+ trace_info comes from somewhere up in PyFrameEval et al, + * Numba doesn't have access to that so creates an equivalent struct and + * wires it up against the cframes. This is passed into the tracing + * functions. + * + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L1611-L1622 + */ + PyTraceInfo trace_info; + trace_info.code = NULL; // not initialized + CFrame *prev_cframe = tstate->cframe; + trace_info.cframe.use_tracing = prev_cframe->use_tracing; + trace_info.cframe.previous = prev_cframe; + + if (trace_info.cframe.use_tracing && tstate->c_profilefunc) +#else + /* + * On Python prior to 3.10, tracing state is a member of the threadstate + */ + if (tstate->use_tracing && tstate->c_profilefunc) +#endif + { + /* + * The following code requires some explaining: + * + * We want the jit-compiled function to be visible to the profiler, so we + * need to synthesize a frame for it. + * The PyFrame_New() constructor doesn't do anything with the 'locals' value if the 'code's + * 'CO_NEWLOCALS' flag is set (which is always the case nowadays). + * So, to get local variables into the frame, we have to manually set the 'f_locals' + * member, then call `PyFrame_LocalsToFast`, where a subsequent call to the `frame.f_locals` + * property (by virtue of the `frame_getlocals` function in frameobject.c) will find them. + */ + PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); + PyObject *globals = PyDict_New(); + PyObject *builtins = PyEval_GetBuiltins(); + PyFrameObject *frame = NULL; + PyObject *result = NULL; +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10)) + // Only used in 3.10, to help with saving/restoring exception state + PyObject *pyexc = NULL; + PyObject *err_type = NULL; + PyObject *err_value = NULL; + PyObject *err_traceback = NULL; +#endif + + if (!code) { + PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); + goto error; + } + /* Populate builtins, which is required by some JITted functions */ + if (PyDict_SetItemString(globals, "__builtins__", builtins)) { + goto error; + } + + /* unset the CO_OPTIMIZED flag, make the frame get a new locals dict */ + code->co_flags &= 0xFFFE; + + frame = PyFrame_New(tstate, code, globals, locals); + if (frame == NULL) { + goto error; + } +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) + // Python 3.11 improved the frame infrastructure such that frames are + // updated by the virtual machine, no need to do PyFrame_LocalsToFast + // and PyFrame_FastToLocals to ensure `frame->f_locals` is consistent. + C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws), frame); +#else + // Populate the 'fast locals' in `frame` + PyFrame_LocalsToFast(frame, 0); + tstate->frame = frame; + + // make the call + C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws)); + + // write changes back to locals? + // PyFrame_FastToLocals can clear the exception indicator, therefore + // this state needs saving and restoring across the call if the + // exception indicator is set. 
+ pyexc = PyErr_Occurred(); + if (pyexc != NULL) { + PyErr_Fetch(&err_type, &err_value, &err_traceback); + } + PyFrame_FastToLocals(frame); + if (pyexc != NULL) { + PyErr_Restore(err_type, err_value, err_traceback); + } + tstate->frame = frame->f_back; +#endif + error: + Py_XDECREF(frame); + Py_XDECREF(globals); + Py_XDECREF(code); + return result; + } + else + { + return fn(PyCFunction_GET_SELF(cfunc), args, kws); + } +} + +#elif (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +// Python 3.12 has a completely new approach to tracing and profiling due to +// the new `sys.monitoring` system. + +// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L863-L868 + +static const int8_t MOST_SIG_BIT[16] = {-1, 0, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, + 3, 3, 3, 3}; + +// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L873-L879 + +static inline int msb(uint8_t bits) { + if (bits > 15) { + return MOST_SIG_BIT[bits>>4]+4; + } + return MOST_SIG_BIT[bits]; +} + + +static int invoke_monitoring(PyThreadState * tstate, int event, Dispatcher *self, PyObject* retval) +{ + // This will invoke monitoring tools (if present) for the event `event`. + // + // Arguments: + // tstate - the interpreter thread state + // event - an event as defined in internal/pycore_instruments.h + // self - the dispatcher + // retval - the return value from running the dispatcher machine code (if needed) + // or NULL if not needed. + // + // Return: + // status 0 for success -1 otherwise. + // + // Notes: + // Python 3.12 has a new monitoring system as described in PEP 669. It's + // largely implemented in CPython PR #103083. + // + // This PEP manifests as a set of monitoring instrumentation in the form of + // per-monitoring-tool-type callbacks stored as part of the interpreter + // state (can also be on the code object for "local events" but Numba + // doesn't support those, see the Numba developer docs). From the Python + // interpreter this appears as `sys.monitoring`, from the C-side there's not + // a great deal of public API for the sort of things that Numba wants/needs + // to do. + // + // The new monitoring system is event based, the general idea in the + // following code is to see if a monitoring "tool" has registered a callback + // to run on the presence of a particular event and run those callbacks if + // so. In Numba's case we're just about to disappear into machine code + // that's essentially doing the same thing as the interpreter would if it + // executed the bytecode present in the function that's been JIT compiled. + // As a result we need to tell any tool that has a callback registered for a + // PY_MONITORING_EVENT_PY_START that a Python function is about to start + // (and do something similar for when a function returns/raises). + // This is a total lie as the execution is in machine code, but telling this + // lie makes it look like a python function has started executing at the + // point the machine code function starts and tools like profilers will be + // able to identify this and do something appropriate. The "lie" is very + // much like lie told for Python < 3.12, but the format of the lie is + // different. There is no fake frame involved, it's just about calling an + // appropriate call back, which in a way is a lot less confusing to deal + // with. 
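+ //
+ // As an illustrative Python-level sketch only (not code in this module,
+ // and "myprofiler" is a hypothetical tool name), the events faked here
+ // are the ones a profiling tool would subscribe to roughly like this:
+ //
+ //     import sys
+ //     TOOL = sys.monitoring.PROFILER_ID
+ //     sys.monitoring.use_tool_id(TOOL, "myprofiler")
+ //     sys.monitoring.register_callback(
+ //         TOOL, sys.monitoring.events.PY_START,
+ //         lambda code, instruction_offset: None)
+ //     sys.monitoring.register_callback(
+ //         TOOL, sys.monitoring.events.PY_RETURN,
+ //         lambda code, instruction_offset, retval: None)
+ //     sys.monitoring.set_events(
+ //         TOOL,
+ //         sys.monitoring.events.PY_START | sys.monitoring.events.PY_RETURN)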
+ // + // For reference, under cProfile all these are NULL, don't even look at + // them, they are legacy, you need to use the monitoring system! + // tstate->c_profilefunc + // tstate->c_profileobj + // tstate->c_tracefunc + // tstate->c_traceobj + // + // Finally: Useful places to look in the CPython code base: + // 1. internal/pycore_instruments.h which has the #defines for all the event + // types and the "types" of tools e.g. debugger, profiler. + // 2. Python/instrumentation.c which is where most of the implementation is + // done. Particularly functions `call_instrumentation_vector` and + // `call_one_instrument`. + // Note that Python/legacy_tracing.c is not somewhere to look, it's just + // wiring old style tracing that has been setup via e.g. C-API + // PyEval_SetProfile into the new monitoring system. + // + // Other things... + // 1. Calls to `sys.monitoring.set_events` clobber the previous state. + // 2. You can register callbacks for an event without having the event set. + // 3. You can set events and have no associated callback. + // 4. Tools are supposed to be respectful of other tools that are + // registered, i.e. not clobber/interfere with each other. + // 5. There are multiple slots for tools, cProfile is a profiler and + // profilers should register in slot 2 by convention. + // + // This is useful for debug: + // To detect whether Python is doing _any_ monitoring it's necessary to + // inspect the per-thread state interpreter monitors.tools member, its a + // uchar[15]. A non-zero value in any tools slot suggests something + // is registered to be called on the occurence of some event. + // + // bool monitoring_tools_present = false; + // for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { + // if (tstate->interp->monitors.tools[i]) { + // monitoring_tools_present = true; + // break; + // } + // } + + // The code in this function is based loosely on a combination of the + // following: + // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L945-L1008 + // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L1010-L1026 + // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L839-L861 + + // TODO: check this, call_instrumentation_vector has this at the top. + if (tstate->tracing){ + return 0; + } + + // Are there any tools set on this thead for this event? + uint8_t tools = tstate->interp->monitors.tools[event]; + // offset value for use in callbacks + PyObject * offset_obj = NULL; + // callback args slots (used in vectorcall protocol) + PyObject * callback_args[3] = {NULL, NULL, NULL}; + + // If so... + if (tools) + { + + + PyObject *result = NULL; + PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); // incref code + if (!code) { + PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); + return -1; + } + + // TODO: handle local events, see `get_tools_for_instruction`. + // The issue with local events is that they maybe don't make a lot of + // sense in a JIT context. The way it works is that + // `sys.monitoring.set_local_events` takes the code object of a function + // and "instruments" it with respect to the requested events. In + // practice this seems to materialise as swapping bytecodes associated + // with the event bitmask for `INSTRUMENTED_` variants of those + // bytecodes. 
Then at interpretation time if an instrumented instruction + // is encountered it triggers lookups in the `code->_co_monitoring` + // struct for tools and active monitors etc. In Numba we _know_ the + // bytecode at which the code starts and we can probably scrape the code + // to look for instrumented return instructions, so it is feasible to + // support at least PY_START and PY_RETURN events, however, it's a lot + // of effort for perhaps something that's practically not that useful. + // As a result, only global events are supported at present. + + // This is supposed to be the offset of the + // currently-being-interpreted bytecode instruction. In Numba's case + // there is no bytecode executing. We know that for a PY_START event + // that the offset is probably zero (it might be 2 if there's a + // closure, it's whereever the `RESUME` bytecode appears). However, + // we don't know which bytecode will be associated with the return + // (without huge effort to wire that through to here). Therefore + // zero is also used for return/raise/unwind, the main use case, + // cProfile, seems to manage to do something sensible even though this + // is inaccurate. + offset_obj = PyLong_FromSsize_t(0); // incref offset_obj + + // This is adapted from call_one_instrument. Note that Numba has to care + // about all events even though it only emits fake events for PY_START, + // PY_RETURN, RAISE and PY_UNWIND, this is because of the ability of + // `objmode` to call back into the interpreter and essentially create a + // continued Python execution environment/stack from there. + while(tools) { + // The tools registered are set as bits in `tools` and provide an + // index into monitoring_callables. This is presumably used by + // cPython to detect if the slot of a tool type is already in use so + // that a user can't register more than one tool of a given type at + // the same time. + int tool = msb(tools); + tools ^= (1 << tool); + // Get the instrument at offset `tool` for the event of interest, + // this is a callback function, it also might not be present! It + // is entirely legitimate to have events that have no callback + // and callbacks that have no event. This is to make it relatively + // easy to switch events on and off and ensure that monitoring is + // "lightweight". + PyObject * instrument = (PyObject *)tstate->interp->monitoring_callables[tool][event]; + if (instrument == NULL){ + continue; + } + + // Swap the threadstate "event" for the event of interest and + // increment the tracing tracking field (essentially, inlined + // PyThreadState_EnterTracing). + int old_what = tstate->what_event; + tstate->what_event = event; + tstate->tracing++; + + // Need to call the callback instrument. Need to know the number of + // arguments, this is based on whether the `retval` (return value) + // is NULL (it indicates whether this is a PY_START, or something + // like a PY_RETURN, which has 3 arguments). + size_t nargsf = (retval == NULL ? 2 : 3) | PY_VECTORCALL_ARGUMENTS_OFFSET; + + // call the instrumentation, look at the args to the callback + // functions for sys.monitoring events to find out what the + // arguments are. e.g. + // PY_START has `func(code: CodeType, instruction_offset: int)` + // whereas + // PY_RETURN has `func(code: CodeType, instruction_offset: int, retval: object)` + // and + // CALL, C_RAISE, C_RETURN has `func(code: CodeType, instruction_offset: int, callable: object, arg0 object|MISSING)` + // i.e. the signature changes based on context. 
This influences the + // value of `nargsf` and what is wired into `callback_args`. First two + // arguments are always code and offset, optional third arg is + // the return value. + callback_args[0] = (PyObject*)code; + callback_args[1] = (PyObject*)offset_obj; + callback_args[2] = (PyObject*)retval; + PyObject ** callargs = &callback_args[0]; + + // finally, stage the call the the instrument + result = PyObject_Vectorcall(instrument, callargs, nargsf, NULL); + + // decrement the tracing tracking field and set the event back to + // the original event (essentially, inlined + // PyThreadState_LeaveTracing). + tstate->tracing--; + tstate->what_event = old_what; + + if (result == NULL){ + // Error occurred in call to instrumentation. + Py_XDECREF(offset_obj); + Py_XDECREF(code); + return -1; + } + } + Py_XDECREF(offset_obj); + Py_XDECREF(code); + } + return 0; +} + +/* invoke monitoring for PY_START if it is set */ +int static inline invoke_monitoring_PY_START(PyThreadState * tstate, Dispatcher *self) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_START, self, NULL); +} + +/* invoke monitoring for PY_RETURN if it is set */ +int static inline invoke_monitoring_PY_RETURN(PyThreadState * tstate, Dispatcher *self, PyObject * retval) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_RETURN, self, retval); +} + +/* invoke monitoring for RAISE if it is set */ +int static inline invoke_monitoring_RAISE(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_RAISE, self, exception); +} + +/* invoke monitoring for PY_UNWIND if it is set */ +int static inline invoke_monitoring_PY_UNWIND(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_UNWIND, self, exception); +} + +/* forward declaration */ +bool static is_sysmon_enabled(Dispatcher *self); + +/* A custom, fast, inlinable version of PyCFunction_Call() */ +static PyObject * +call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) +{ + PyCFunctionWithKeywords fn = NULL; + PyThreadState *tstate = NULL; + PyObject * pyresult = NULL; + PyObject * pyexception = NULL; + const bool enabled_sysmon = is_sysmon_enabled(self); + + assert(PyCFunction_Check(cfunc)); + assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); + fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); + tstate = PyThreadState_GET(); + // issue PY_START if event is set + if(enabled_sysmon && invoke_monitoring_PY_START(tstate, self) != 0){ + return NULL; + } + // make call + pyresult = fn(PyCFunction_GET_SELF(cfunc), args, kws); + if (enabled_sysmon && pyresult == NULL) { + // pyresult == NULL, which means the Numba function raised an exception + // which is now pending. + // + // NOTE: that _ALL_ exceptions trigger the RAISE event, even a + // StopIteration exception. To get a STOP_ITERATION event, the + // StopIteration exception must be "implied" i.e. a for loop exhausting + // a generator, whereas those coming from the executing the binary + // wrapped in this dispatcher must always be explicit (this is after all + // a function dispatcher). + // + // NOTE: That it is necessary to trigger both a `RAISE` event, as this + // triggered by an exception being raised, and a `PY_UNWIND` event, as + // this is the event for "exiting from a python function during + // exception unwinding" (see CPython sys.monitoring docs). 
+ // + // In the following, if the call to PyErr_GetRaisedException returns + // NULL, it means that something has cleared the error indicator and + // this is a most surprising state to occur (shouldn't be possible!). + // + // TODO: This makes the exception raising path a little slower as the + // exception state is suspended and resumed regardless of whether + // monitoring for such an event is set. In future it might be worth + // checking the tstate->interp->monitors.tools[event] and only doing the + // suspend/resume if something is listening for the event. + pyexception = PyErr_GetRaisedException(); + if (pyexception != NULL) { + if(invoke_monitoring_RAISE(tstate, self, pyexception) != 0){ + // If the monitoring callback raised, return NULL so that the + // exception can propagate. + return NULL; + } + if(invoke_monitoring_PY_UNWIND(tstate, self, pyexception) != 0){ + // If the monitoring callback raised, return NULL so that the + // exception can propagate. + return NULL; + } + // reset the exception + PyErr_SetRaisedException(pyexception); + } + // Exception in Numba call as pyresult == NULL, start to unwind by + // returning NULL. + return NULL; + } + // issue PY_RETURN if event is set + if(enabled_sysmon && invoke_monitoring_PY_RETURN(tstate, self, pyresult) != 0){ + return NULL; + } + return pyresult; +} +#else +#error "Python version is not supported." +#endif + + +static +PyObject* +compile_and_invoke(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc, *retval; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + /* NOTE: we call the compiled function ourselves instead of + letting the Python derived class do it. This is for proper + behaviour of globals() in jitted functions (issue #476). */ + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + if (cfunc == NULL) + return NULL; + + if (PyObject_TypeCheck(cfunc, &PyCFunction_Type)) { + retval = call_cfunc(self, cfunc, args, kws, locals); + } else { + /* Re-enter interpreter */ + retval = PyObject_Call(cfunc, args, kws); + } + Py_DECREF(cfunc); + + return retval; +} + +/* A copy of compile_and_invoke, that only compiles. This is needed for CUDA + * kernels, because its overloads are Python instances of the _Kernel class, + * rather than compiled functions. Once CUDA overloads are compiled functions, + * cuda_compile_only can be removed. */ +static +PyObject* +cuda_compile_only(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + return cfunc; +} + +static int +find_named_args(Dispatcher *self, PyObject **pargs, PyObject **pkws) +{ + PyObject *oldargs = *pargs, *newargs; + PyObject *kws = *pkws; + Py_ssize_t pos_args = PyTuple_GET_SIZE(oldargs); + Py_ssize_t named_args, total_args, i; + Py_ssize_t func_args = PyTuple_GET_SIZE(self->argnames); + Py_ssize_t defaults = PyTuple_GET_SIZE(self->defargs); + /* Last parameter with a default value */ + Py_ssize_t last_def = (self->has_stararg) + ? 
func_args - 2 + : func_args - 1; + /* First parameter with a default value */ + Py_ssize_t first_def = last_def - defaults + 1; + /* Minimum number of required arguments */ + Py_ssize_t minargs = first_def; + + if (kws != NULL) + named_args = PyDict_Size(kws); + else + named_args = 0; + total_args = pos_args + named_args; + if (!self->has_stararg && total_args > func_args) { + PyErr_Format(PyExc_TypeError, + "too many arguments: expected %d, got %d", + (int) func_args, (int) total_args); + return -1; + } + else if (total_args < minargs) { + if (minargs == func_args) + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected %d, got %d", + (int) minargs, (int) total_args); + else + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected at least %d, got %d", + (int) minargs, (int) total_args); + return -1; + } + newargs = PyTuple_New(func_args); + if (!newargs) + return -1; + /* First pack the stararg */ + if (self->has_stararg) { + Py_ssize_t stararg_size = Py_MAX(0, pos_args - func_args + 1); + PyObject *stararg = PyTuple_New(stararg_size); + if (!stararg) { + Py_DECREF(newargs); + return -1; + } + for (i = 0; i < stararg_size; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, func_args - 1 + i); + Py_INCREF(value); + PyTuple_SET_ITEM(stararg, i, value); + } + /* Put it in last position */ + PyTuple_SET_ITEM(newargs, func_args - 1, stararg); + + } + for (i = 0; i < pos_args; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + } + + /* Iterate over missing positional arguments, try to find them in + named arguments or default values. */ + for (i = pos_args; i < func_args; i++) { + PyObject *name = PyTuple_GET_ITEM(self->argnames, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + if (kws != NULL) { + /* Named argument? */ + PyObject *value = PyDict_GetItem(kws, name); + if (value != NULL) { + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + named_args--; + continue; + } + } + if (i >= first_def && i <= last_def) { + /* Argument has a default value? */ + PyObject *value = PyTuple_GET_ITEM(self->defargs, i - first_def); + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + continue; + } + else if (i < func_args - 1 || !self->has_stararg) { + PyErr_Format(PyExc_TypeError, + "missing argument '%s'", + PyString_AsString(name)); + Py_DECREF(newargs); + return -1; + } + } + if (named_args) { + PyErr_Format(PyExc_TypeError, + "some keyword arguments unexpected"); + Py_DECREF(newargs); + return -1; + } + *pargs = newargs; + *pkws = NULL; + return 0; +} + +static PyObject* +Dispatcher_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 
1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + goto CLEANUP; + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. + + Note that the number of matches is returned in matches by resolve, which + accepts it as a reference. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager. + */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + if (matches == 1) { + /* Definition is found */ + retval = call_cfunc(self, cfunc, args, kws, locals); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = compile_and_invoke(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = compile_and_invoke(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +/* Based on Dispatcher_call above, with the following differences: + 1. It does not invoke the definition of the function. + 2. It returns the definition, instead of a value returned by the function. + + This is because CUDA functions are, at present, _Kernel objects rather than + compiled functions. */ +static PyObject* +Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 
1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + goto CLEANUP; + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager. + */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + + if (matches == 1) { + /* Definition is found */ + retval = cfunc; + Py_INCREF(retval); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = cuda_compile_only(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = cuda_compile_only(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +static int +import_devicearray(void) +{ + PyObject *devicearray = PyImport_ImportModule("numba._devicearray"); + if (devicearray == NULL) { + return -1; + } + Py_DECREF(devicearray); + + DeviceArray_API = (void**)PyCapsule_Import("numba._devicearray._DEVICEARRAY_API", 0); + if (DeviceArray_API == NULL) { + return -1; + } + + return 0; +} + +static PyMethodDef Dispatcher_methods[] = { + { "_clear", (PyCFunction)Dispatcher_clear, METH_NOARGS, NULL }, + { "_insert", (PyCFunction)Dispatcher_Insert, METH_VARARGS | METH_KEYWORDS, + "insert new definition"}, + { "_cuda_call", (PyCFunction)Dispatcher_cuda_call, + METH_VARARGS | METH_KEYWORDS, "CUDA call resolution" }, + { NULL }, +}; + +static PyMemberDef Dispatcher_members[] = { + {(char*)"_can_compile", T_BOOL, offsetof(Dispatcher, can_compile), 0, NULL }, + {(char*)"_enable_sysmon", T_BOOL, offsetof(Dispatcher, enable_sysmon), 0, NULL }, + {NULL} /* Sentinel */ +}; + + +static PyTypeObject DispatcherType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_dispatcher.Dispatcher", /* tp_name */ + sizeof(Dispatcher), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Dispatcher_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 
0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + (PyCFunctionWithKeywords)Dispatcher_call, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags*/ + "Dispatcher object", /* tp_doc */ + (traverseproc) Dispatcher_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Dispatcher_methods, /* tp_methods */ + Dispatcher_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Dispatcher_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + + +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +static +bool is_sysmon_enabled(Dispatcher * self) { + return self->enable_sysmon; +} +#endif + +static PyObject *compute_fingerprint(PyObject *self, PyObject *args) +{ + PyObject *val; + if (!PyArg_ParseTuple(args, "O:compute_fingerprint", &val)) + return NULL; + return typeof_compute_fingerprint(val); +} + +static PyMethodDef ext_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(typeof_init), + declmethod(compute_fingerprint), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(_dispatcher) { + if (import_devicearray() < 0) { + PyErr_Print(); + PyErr_SetString(PyExc_ImportError, "numba._devicearray failed to import"); + return MOD_ERROR_VAL; + } + + PyObject *m; + MOD_DEF(m, "_dispatcher", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + + DispatcherType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DispatcherType) < 0) { + return MOD_ERROR_VAL; + } + Py_INCREF(&DispatcherType); + PyModule_AddObject(m, "Dispatcher", (PyObject*)(&DispatcherType)); + + return MOD_SUCCESS_VAL(m); +} diff --git a/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp b/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp new file mode 100644 index 000000000..d926256d3 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp @@ -0,0 +1,529 @@ +/* + * This file and _hashtable.h are from CPython 3.5. The symbols have been + * renamed from _Py_hashxxx to _Numba_hashxxx to avoid name clashes with + * the CPython definitions (including at runtime through dynamic linking). + * Those CPython APIs are private and can change in incompatible ways at + * any time. 
+ * + * Command line used for renaming: + * $ sed -i -r 's/\b_Py_(has[h]table)/_Numba_\1/ig' numba/_hashtable.h numba/_hashtable.c + */ + +/* The implementation of the hash table (_Numba_hashtable_t) is based on the cfuhash + project: + http://sourceforge.net/projects/libcfu/ + + Copyright of cfuhash: + ---------------------------------- + Creation date: 2005-06-24 21:22:40 + Authors: Don + Change log: + + Copyright (c) 2005 Don Owens + All rights reserved. + + This code is released under the BSD license: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the author nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. 
+ ---------------------------------- +*/ + +#include "_pymodule.h" +#include "_hashtable.h" + +#define HASHTABLE_MIN_SIZE 16 +#define HASHTABLE_HIGH 0.50 +#define HASHTABLE_LOW 0.10 +#define HASHTABLE_REHASH_FACTOR 2.0 / (HASHTABLE_LOW + HASHTABLE_HIGH) + +#define BUCKETS_HEAD(SLIST) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(SLIST))) +#define TABLE_HEAD(HT, BUCKET) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(HT)->buckets[BUCKET])) +#define ENTRY_NEXT(ENTRY) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_ITEM_NEXT(ENTRY)) +#define HASHTABLE_ITEM_SIZE(HT) \ + (sizeof(_Numba_hashtable_entry_t) + (HT)->data_size) + +/* Forward declaration */ +static void hashtable_rehash(_Numba_hashtable_t *ht); + +static void +_Py_slist_init(_Py_slist_t *list) +{ + list->head = NULL; +} + +static void +_Py_slist_prepend(_Py_slist_t *list, _Py_slist_item_t *item) +{ + item->next = list->head; + list->head = item; +} + +static void +_Py_slist_remove(_Py_slist_t *list, _Py_slist_item_t *previous, + _Py_slist_item_t *item) +{ + if (previous != NULL) + previous->next = item->next; + else + list->head = item->next; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_int(const void *key) +{ + return (Py_uhash_t)key; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_ptr(const void *key) +{ + return (Py_uhash_t)_Py_HashPointer((void *)key); +} + +extern "C" int +_Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry) +{ + return entry->key == key; +} + +/* makes sure the real size of the buckets array is a power of 2 */ +static size_t +round_size(size_t s) +{ + size_t i; + if (s < HASHTABLE_MIN_SIZE) + return HASHTABLE_MIN_SIZE; + i = 1; + while (i < s) + i <<= 1; + return i; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new_full(size_t data_size, size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator) +{ + _Numba_hashtable_t *ht; + size_t buckets_size; + _Numba_hashtable_allocator_t alloc; + + if (allocator == NULL) { + alloc.malloc = PyMem_RawMalloc; + alloc.free = PyMem_RawFree; + } + else + alloc = *allocator; + + ht = (_Numba_hashtable_t *)alloc.malloc(sizeof(_Numba_hashtable_t)); + if (ht == NULL) + return ht; + + ht->num_buckets = round_size(init_size); + ht->entries = 0; + ht->data_size = data_size; + + buckets_size = ht->num_buckets * sizeof(ht->buckets[0]); + ht->buckets = (_Py_slist_t *) alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + alloc.free(ht); + return NULL; + } + memset(ht->buckets, 0, buckets_size); + + ht->hash_func = hash_func; + ht->compare_func = compare_func; + ht->copy_data_func = copy_data_func; + ht->free_data_func = free_data_func; + ht->get_data_size_func = get_data_size_func; + ht->alloc = alloc; + return ht; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new(size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func) +{ + return _Numba_hashtable_new_full(data_size, HASHTABLE_MIN_SIZE, + hash_func, compare_func, + NULL, NULL, NULL, NULL); +} + +extern "C" size_t +_Numba_hashtable_size(_Numba_hashtable_t *ht) +{ + size_t size; + size_t hv; + + size = sizeof(_Numba_hashtable_t); + + /* buckets */ + size += ht->num_buckets * sizeof(_Numba_hashtable_entry_t *); + + /* entries */ + size += ht->entries * HASHTABLE_ITEM_SIZE(ht); + + /* 
data linked from entries */ + if (ht->get_data_size_func) { + for (hv = 0; hv < ht->num_buckets; hv++) { + _Numba_hashtable_entry_t *entry; + + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + void *data; + + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + size += ht->get_data_size_func(data); + } + } + } + return size; +} + +#ifdef Py_DEBUG +extern "C" void +_Numba_hashtable_print_stats(_Numba_hashtable_t *ht) +{ + size_t size; + size_t chain_len, max_chain_len, total_chain_len, nchains; + _Numba_hashtable_entry_t *entry; + size_t hv; + double load; + + size = _Numba_hashtable_size(ht); + + load = (double)ht->entries / ht->num_buckets; + + max_chain_len = 0; + total_chain_len = 0; + nchains = 0; + for (hv = 0; hv < ht->num_buckets; hv++) { + entry = TABLE_HEAD(ht, hv); + if (entry != NULL) { + chain_len = 0; + for (; entry; entry = ENTRY_NEXT(entry)) { + chain_len++; + } + if (chain_len > max_chain_len) + max_chain_len = chain_len; + total_chain_len += chain_len; + nchains++; + } + } + printf("hash table %p: entries=%" + PY_FORMAT_SIZE_T "u/%" PY_FORMAT_SIZE_T "u (%.0f%%), ", + ht, ht->entries, ht->num_buckets, load * 100.0); + if (nchains) + printf("avg_chain_len=%.1f, ", (double)total_chain_len / nchains); + printf("max_chain_len=%" PY_FORMAT_SIZE_T "u, %" PY_FORMAT_SIZE_T "u kB\n", + max_chain_len, size / 1024); +} +#endif + +/* Get an entry. Return NULL if the key does not exist. */ +extern "C" _Numba_hashtable_entry_t * +_Numba_hashtable_get_entry(_Numba_hashtable_t *ht, const void *key) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + } + + return entry; +} + +static int +_hashtable_pop_entry(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry, *previous; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + previous = NULL; + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + previous = entry; + } + + if (entry == NULL) + return 0; + + _Py_slist_remove(&ht->buckets[index], (_Py_slist_item_t *)previous, + (_Py_slist_item_t *)entry); + ht->entries--; + + if (data != NULL) + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + ht->alloc.free(entry); + + if ((float)ht->entries / (float)ht->num_buckets < HASHTABLE_LOW) + hashtable_rehash(ht); + return 1; +} + +/* Add a new entry to the hash. The key must not be present in the hash table. + Return 0 on success, -1 on memory error. */ +extern "C" int +_Numba_hashtable_set(_Numba_hashtable_t *ht, const void *key, + void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + assert(data != NULL || data_size == 0); +#ifndef NDEBUG + /* Don't write the assertion on a single line because it is interesting + to know the duplicated entry if the assertion failed. The entry can + be read using a debugger. 
*/ + entry = _Numba_hashtable_get_entry(ht, key); + assert(entry == NULL); +#endif + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + entry = (_Numba_hashtable_entry_t *) ht->alloc.malloc(HASHTABLE_ITEM_SIZE(ht)); + if (entry == NULL) { + /* memory allocation failed */ + return -1; + } + + entry->key = (void *)key; + entry->key_hash = key_hash; + + assert(data_size == ht->data_size); + memcpy(_Numba_HASHTABLE_ENTRY_DATA(entry), data, data_size); + + _Py_slist_prepend(&ht->buckets[index], (_Py_slist_item_t*)entry); + ht->entries++; + + if ((float)ht->entries / (float)ht->num_buckets > HASHTABLE_HIGH) + hashtable_rehash(ht); + return 0; +} + +/* Get data from an entry. Copy entry data into data and return 1 if the entry + exists, return 0 if the entry does not exist. */ +extern "C" int +_Numba_hashtable_get(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + _Numba_hashtable_entry_t *entry; + + assert(data != NULL); + + entry = _Numba_hashtable_get_entry(ht, key); + if (entry == NULL) + return 0; + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + return 1; +} + +extern "C" int +_Numba_hashtable_pop(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + assert(data != NULL); + assert(ht->free_data_func == NULL); + return _hashtable_pop_entry(ht, key, data, data_size); +} + +/* Delete an entry. The entry must exist. */ +extern "C" void +_Numba_hashtable_delete(_Numba_hashtable_t *ht, const void *key) +{ +#ifndef NDEBUG + int found = _hashtable_pop_entry(ht, key, NULL, 0); + assert(found); +#else + (void)_hashtable_pop_entry(ht, key, NULL, 0); +#endif +} + +/* Prototype for a pointer to a function to be called foreach + key/value pair in the hash by hashtable_foreach(). Iteration + stops if a non-zero value is returned. 
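+
+   A minimal sketch of such a callback (illustrative only; count_entry and
+   n are hypothetical names, not defined in this file):
+
+     static int count_entry(_Numba_hashtable_entry_t *entry, void *arg)
+     {
+         size_t *n = (size_t *) arg;   // user-supplied counter
+         (void) entry;                 // the entry itself is not needed here
+         (*n)++;
+         return 0;                     // zero means: keep iterating
+     }
+
+   driven by a call such as _Numba_hashtable_foreach(ht, count_entry, &n).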
*/ +extern "C" int +_Numba_hashtable_foreach(_Numba_hashtable_t *ht, + int (*func) (_Numba_hashtable_entry_t *entry, void *arg), + void *arg) +{ + _Numba_hashtable_entry_t *entry; + size_t hv; + + for (hv = 0; hv < ht->num_buckets; hv++) { + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + int res = func(entry, arg); + if (res) + return res; + } + } + return 0; +} + +static void +hashtable_rehash(_Numba_hashtable_t *ht) +{ + size_t buckets_size, new_size, bucket; + _Py_slist_t *old_buckets = NULL; + size_t old_num_buckets; + + new_size = round_size((size_t)(ht->entries * HASHTABLE_REHASH_FACTOR)); + if (new_size == ht->num_buckets) + return; + + old_num_buckets = ht->num_buckets; + + buckets_size = new_size * sizeof(ht->buckets[0]); + old_buckets = ht->buckets; + ht->buckets = (_Py_slist_t *) ht->alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + /* cancel rehash on memory allocation failure */ + ht->buckets = old_buckets ; + /* memory allocation failed */ + return; + } + memset(ht->buckets, 0, buckets_size); + + ht->num_buckets = new_size; + + for (bucket = 0; bucket < old_num_buckets; bucket++) { + _Numba_hashtable_entry_t *entry, *next; + for (entry = BUCKETS_HEAD(old_buckets[bucket]); entry != NULL; entry = next) { + size_t entry_index; + + assert(ht->hash_func(entry->key) == entry->key_hash); + next = ENTRY_NEXT(entry); + entry_index = entry->key_hash & (new_size - 1); + + _Py_slist_prepend(&ht->buckets[entry_index], (_Py_slist_item_t*)entry); + } + } + + ht->alloc.free(old_buckets); +} + +extern "C" void +_Numba_hashtable_clear(_Numba_hashtable_t *ht) +{ + _Numba_hashtable_entry_t *entry, *next; + size_t i; + + for (i=0; i < ht->num_buckets; i++) { + for (entry = TABLE_HEAD(ht, i); entry != NULL; entry = next) { + next = ENTRY_NEXT(entry); + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + } + _Py_slist_init(&ht->buckets[i]); + } + ht->entries = 0; + hashtable_rehash(ht); +} + +extern "C" void +_Numba_hashtable_destroy(_Numba_hashtable_t *ht) +{ + size_t i; + + for (i = 0; i < ht->num_buckets; i++) { + _Py_slist_item_t *entry = ht->buckets[i].head; + while (entry) { + _Py_slist_item_t *entry_next = entry->next; + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + entry = entry_next; + } + } + + ht->alloc.free(ht->buckets); + ht->alloc.free(ht); +} + +/* Return a copy of the hash table */ +extern "C" _Numba_hashtable_t * +_Numba_hashtable_copy(_Numba_hashtable_t *src) +{ + _Numba_hashtable_t *dst; + _Numba_hashtable_entry_t *entry; + size_t bucket; + int err; + void *data, *new_data; + + dst = _Numba_hashtable_new_full(src->data_size, src->num_buckets, + src->hash_func, src->compare_func, + src->copy_data_func, src->free_data_func, + src->get_data_size_func, &src->alloc); + if (dst == NULL) + return NULL; + + for (bucket=0; bucket < src->num_buckets; bucket++) { + entry = TABLE_HEAD(src, bucket); + for (; entry; entry = ENTRY_NEXT(entry)) { + if (src->copy_data_func) { + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + new_data = src->copy_data_func(data); + if (new_data != NULL) + err = _Numba_hashtable_set(dst, entry->key, + &new_data, src->data_size); + else + err = 1; + } + else { + data = _Numba_HASHTABLE_ENTRY_DATA(entry); + err = _Numba_hashtable_set(dst, entry->key, data, src->data_size); + } + if (err) { + _Numba_hashtable_destroy(dst); + return NULL; + } + } + } + return dst; +} diff --git 
a/numba_cuda/numba/cuda/_dispatcher/_hashtable.h b/numba_cuda/numba/cuda/_dispatcher/_hashtable.h new file mode 100644 index 000000000..fbc6d6013 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_hashtable.h @@ -0,0 +1,132 @@ +/* + * See _hashtable.c for more information about this file. + */ + +#ifndef Py_HASHTABLE_H +#define Py_HASHTABLE_H + +/* The whole API is private */ +#ifndef Py_LIMITED_API + +typedef struct _Py_slist_item_s { + struct _Py_slist_item_s *next; +} _Py_slist_item_t; + +typedef struct { + _Py_slist_item_t *head; +} _Py_slist_t; + +#define _Py_SLIST_ITEM_NEXT(ITEM) (((_Py_slist_item_t *)ITEM)->next) + +#define _Py_SLIST_HEAD(SLIST) (((_Py_slist_t *)SLIST)->head) + +typedef struct { + /* used by _Numba_hashtable_t.buckets to link entries */ + _Py_slist_item_t _Py_slist_item; + + const void *key; + Py_uhash_t key_hash; + + /* data follows */ +} _Numba_hashtable_entry_t; + +#define _Numba_HASHTABLE_ENTRY_DATA(ENTRY) \ + ((char *)(ENTRY) + sizeof(_Numba_hashtable_entry_t)) + +#define _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(ENTRY) \ + (*(void **)_Numba_HASHTABLE_ENTRY_DATA(ENTRY)) + +#define _Numba_HASHTABLE_ENTRY_READ_DATA(TABLE, DATA, DATA_SIZE, ENTRY) \ + do { \ + assert((DATA_SIZE) == (TABLE)->data_size); \ + memcpy(DATA, _Numba_HASHTABLE_ENTRY_DATA(ENTRY), DATA_SIZE); \ + } while (0) + +typedef Py_uhash_t (*_Numba_hashtable_hash_func) (const void *key); +typedef int (*_Numba_hashtable_compare_func) (const void *key, const _Numba_hashtable_entry_t *he); +typedef void* (*_Numba_hashtable_copy_data_func)(void *data); +typedef void (*_Numba_hashtable_free_data_func)(void *data); +typedef size_t (*_Numba_hashtable_get_data_size_func)(void *data); + +typedef struct { + /* allocate a memory block */ + void* (*malloc) (size_t size); + + /* release a memory block */ + void (*free) (void *ptr); +} _Numba_hashtable_allocator_t; + +typedef struct { + size_t num_buckets; + size_t entries; /* Total number of entries in the table. 
*/ + _Py_slist_t *buckets; + size_t data_size; + + _Numba_hashtable_hash_func hash_func; + _Numba_hashtable_compare_func compare_func; + _Numba_hashtable_copy_data_func copy_data_func; + _Numba_hashtable_free_data_func free_data_func; + _Numba_hashtable_get_data_size_func get_data_size_func; + _Numba_hashtable_allocator_t alloc; +} _Numba_hashtable_t; + +/* hash and compare functions for integers and pointers */ +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_ptr(const void *key); +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_int(const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry); + +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new( + size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new_full( + size_t data_size, + size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_copy(_Numba_hashtable_t *src); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_clear(_Numba_hashtable_t *ht); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_destroy(_Numba_hashtable_t *ht); + +typedef int (*_Numba_hashtable_foreach_func) (_Numba_hashtable_entry_t *entry, void *arg); + +extern "C" PyAPI_FUNC(int) _Numba_hashtable_foreach( + _Numba_hashtable_t *ht, + _Numba_hashtable_foreach_func func, void *arg); +extern "C" PyAPI_FUNC(size_t) _Numba_hashtable_size(_Numba_hashtable_t *ht); + +extern "C" PyAPI_FUNC(_Numba_hashtable_entry_t*) _Numba_hashtable_get_entry( + _Numba_hashtable_t *ht, + const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_set( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_get( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_pop( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_delete( + _Numba_hashtable_t *ht, + const void *key); + +#define _Numba_HASHTABLE_SET(TABLE, KEY, DATA) \ + _Numba_hashtable_set(TABLE, KEY, &(DATA), sizeof(DATA)) + +#define _Numba_HASHTABLE_GET(TABLE, KEY, DATA) \ + _Numba_hashtable_get(TABLE, KEY, &(DATA), sizeof(DATA)) + +#endif /* Py_LIMITED_API */ + +#endif diff --git a/numba_cuda/numba/cuda/_dispatcher/_numba_common.h b/numba_cuda/numba/cuda/_dispatcher/_numba_common.h new file mode 100644 index 000000000..d458e4240 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_numba_common.h @@ -0,0 +1,43 @@ +#ifndef NUMBA_COMMON_H_ +#define NUMBA_COMMON_H_ + +/* __has_attribute() is a clang / gcc-5 macro */ +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + +/* This attribute marks symbols that can be shared across C objects + * but are not exposed outside of a shared library or executable. + * Note this is default behaviour for global symbols under Windows. 
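+ *
+ * For example (an illustrative sketch; the declarations below are
+ * hypothetical and not made by this header):
+ *
+ *   VISIBILITY_HIDDEN int internal_counter;     // shared inside the DSO only
+ *   VISIBILITY_GLOBAL void public_entry(void);  // exported from the DSO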
+ */ +#if defined(_MSC_VER) + #define VISIBILITY_HIDDEN + #define VISIBILITY_GLOBAL __declspec(dllexport) +#elif (__has_attribute(visibility) || (defined(__GNUC__) && __GNUC__ >= 4)) + #define VISIBILITY_HIDDEN __attribute__ ((visibility("hidden"))) + #define VISIBILITY_GLOBAL __attribute__ ((visibility("default"))) +#else + #define VISIBILITY_HIDDEN + #define VISIBILITY_GLOBAL +#endif + +/* + * Numba's version of the PyArray_DescrCheck macro from NumPy, use it as a + * direct replacement of NumPy's PyArray_DescrCheck to ensure binary + * compatibility. + * + * Details of why this is needed: + * NumPy 1.18 changed the definition of the PyArray_DescrCheck macro here: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698 + * the result of this being that building against NumPy <1.18 would prevent + * Numba running against NumPy >= 1.20 as noted here: + * https://github.com/numba/numba/issues/6041#issuecomment-665132199 + * + * This macro definition is copied from: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698#diff-ad2213da23136c5fc5883d9eb2d88666R26 + * + * NOTE: This is the NumPy 1.18 and above version of the macro. + */ +#define NUMBA_PyArray_DescrCheck(op) PyObject_TypeCheck(op, &PyArrayDescr_Type) + +#endif /* NUMBA_COMMON_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_pymodule.h b/numba_cuda/numba/cuda/_dispatcher/_pymodule.h new file mode 100644 index 000000000..c261314f5 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_pymodule.h @@ -0,0 +1,35 @@ +#ifndef NUMBA_PY_MODULE_H_ +#define NUMBA_PY_MODULE_H_ + +#define PY_SSIZE_T_CLEAN + +#include "Python.h" +#include "structmember.h" +#include "frameobject.h" + +#define MOD_ERROR_VAL NULL +#define MOD_SUCCESS_VAL(val) val +#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) +#define MOD_DEF(ob, name, doc, methods) { \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, NULL, NULL, NULL, NULL }; \ + ob = PyModule_Create(&moduledef); } +#define MOD_INIT_EXEC(name) PyInit_##name(); + +#define PyString_AsString PyUnicode_AsUTF8 +#define PyString_Check PyUnicode_Check +#define PyString_FromFormat PyUnicode_FromFormat +#define PyString_FromString PyUnicode_FromString +#define PyString_InternFromString PyUnicode_InternFromString +#define PyInt_Type PyLong_Type +#define PyInt_Check PyLong_Check +#define PyInt_CheckExact PyLong_CheckExact +#define SetAttrStringFromVoidPointer(m, name) do { \ + PyObject *tmp = PyLong_FromVoidPtr((void *) &name); \ + PyObject_SetAttrString(m, #name, tmp); \ + Py_DECREF(tmp); } while (0) + + +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +#endif /* NUMBA_PY_MODULE_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp new file mode 100644 index 000000000..99a1bd23e --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp @@ -0,0 +1,1170 @@ +#include "_pymodule.h" + +#include +#include +#include + +#include "_numba_common.h" +#include "_typeof.h" +#include "_hashtable.h" +#include "_devicearray.h" +#include "pyerrors.h" + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include +#if NPY_ABI_VERSION >= 0x02000000 + #include +#endif + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 13) + #ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 + #endif + #include "internal/pycore_setobject.h" // _PySet_NextEntry() +#endif + + +/* 
Cached typecodes for basic scalar types */ +static int tc_int8; +static int tc_int16; +static int tc_int32; +static int tc_int64; +static int tc_uint8; +static int tc_uint16; +static int tc_uint32; +static int tc_uint64; +static int tc_float32; +static int tc_float64; +static int tc_complex64; +static int tc_complex128; +static int BASIC_TYPECODES[12]; + +static int tc_intp; + +/* The type object for the numba .dispatcher.OmittedArg class + * that wraps omitted arguments. + */ +static PyObject *omittedarg_type; + +static PyObject *typecache; +static PyObject *ndarray_typecache; +static PyObject *structured_dtypes; + +static PyObject *str_typeof_pyval = NULL; +static PyObject *str_value = NULL; +static PyObject *str_numba_type = NULL; + +/* CUDA device array API */ +void **DeviceArray_API; + +/* + * Type fingerprint computation. + */ + +typedef struct { + /* A buffer the fingerprint will be written to */ + char *buf; + size_t n; + size_t allocated; + /* A preallocated buffer, sufficient to fit the fingerprint for most types */ + char static_buf[40]; +} string_writer_t; + +static void +string_writer_init(string_writer_t *w) +{ + w->buf = w->static_buf; + w->n = 0; + w->allocated = sizeof(w->static_buf) / sizeof(unsigned char); +} + +static void +string_writer_clear(string_writer_t *w) +{ + if (w->buf != w->static_buf) + free(w->buf); +} + +static void +string_writer_move(string_writer_t *dest, const string_writer_t *src) +{ + dest->n = src->n; + dest->allocated = src->allocated; + if (src->buf == src->static_buf) { + dest->buf = dest->static_buf; + memcpy(dest->buf, src->buf, src->n); + } + else { + dest->buf = src->buf; + } +} + +/* Ensure at least *bytes* can be appended to the string writer's buffer. */ +static int +string_writer_ensure(string_writer_t *w, size_t bytes) +{ + size_t newsize; + bytes += w->n; + if (bytes <= w->allocated) + return 0; + newsize = (w->allocated << 2) + 1; + if (newsize < bytes) + newsize = bytes; + if (w->buf == w->static_buf) { + w->buf = (char *) malloc(newsize); + memcpy(w->buf, w->static_buf, w->allocated); + } + else + w->buf = (char *) realloc(w->buf, newsize); + if (w->buf) { + w->allocated = newsize; + return 0; + } + else { + PyErr_NoMemory(); + return -1; + } +} + +static int +string_writer_put_char(string_writer_t *w, unsigned char c) +{ + if (string_writer_ensure(w, 1)) + return -1; + w->buf[w->n++] = c; + return 0; +} + +static int +string_writer_put_int32(string_writer_t *w, unsigned int v) +{ + if (string_writer_ensure(w, 4)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; + w->n += 4; + return 0; +} + +static int +string_writer_put_intp(string_writer_t *w, npy_intp v) +{ + if (string_writer_ensure(w, NPY_SIZEOF_PY_INTPTR_T)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; +#if NPY_SIZEOF_PY_INTPTR_T == 8 + w->buf[w->n + 4] = (v >> 32) & 0xff; + w->buf[w->n + 5] = (v >> 40) & 0xff; + w->buf[w->n + 6] = (v >> 48) & 0xff; + w->buf[w->n + 7] = (v >> 56) & 0xff; +#endif + w->n += NPY_SIZEOF_PY_INTPTR_T; + return 0; +} + +static int +string_writer_put_string(string_writer_t *w, const char *s) +{ + if (s == NULL) { + return string_writer_put_char(w, 0); + } + else { + size_t N = strlen(s) + 1; + if (string_writer_ensure(w, N)) + return -1; + memcpy(w->buf + w->n, s, N); + w->n += N; + return 0; + } +} + +enum opcode { + OP_START_TUPLE = 
'(', + OP_END_TUPLE = ')', + OP_INT = 'i', + OP_FLOAT = 'f', + OP_COMPLEX = 'c', + OP_BOOL = '?', + OP_OMITTED = '!', + + OP_BYTEARRAY = 'a', + OP_BYTES = 'b', + OP_NONE = 'n', + OP_LIST = '[', + OP_SET = '{', + + OP_BUFFER = 'B', + OP_NP_SCALAR = 'S', + OP_NP_ARRAY = 'A', + OP_NP_DTYPE = 'D' +}; + +#define TRY(func, w, arg) \ + do { \ + if (func(w, arg)) return -1; \ + } while (0) + + +static int +fingerprint_unrecognized(void) +{ + PyErr_SetString(PyExc_NotImplementedError, + "cannot compute type fingerprint for value"); + return -1; +} + +static int +compute_dtype_fingerprint(string_writer_t *w, PyArray_Descr *descr) +{ + int typenum = descr->type_num; + if (typenum < NPY_OBJECT) + return string_writer_put_char(w, (char) typenum); + if (typenum == NPY_VOID) { + /* Structured dtype: serialize the dtype pointer. Unfortunately, + * some structured dtypes can be ephemeral, so we have to + * intern them to avoid pointer reuse and fingerprint collisions. + * (e.g. np.recarray(dtype=some_dtype) creates a new dtype + * equal to some_dtype) + */ + PyObject *interned = PyDict_GetItem(structured_dtypes, + (PyObject *) descr); + if (interned == NULL) { + interned = (PyObject *) descr; + if (PyDict_SetItem(structured_dtypes, interned, interned)) + return -1; + } + TRY(string_writer_put_char, w, (char) typenum); + return string_writer_put_intp(w, (npy_intp) interned); + } +#if NPY_API_VERSION >= 0x00000007 + if (PyTypeNum_ISDATETIME(typenum)) { + PyArray_DatetimeMetaData *md; +#if NPY_ABI_VERSION >= 0x02000000 + md = &(((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(descr))->meta); +#else + md = &(((PyArray_DatetimeDTypeMetaData *)descr->c_metadata)->meta); +#endif + TRY(string_writer_put_char, w, (char) typenum); + TRY(string_writer_put_char, w, (char) md->base); + return string_writer_put_int32(w, (char) md->num); + } +#endif + + return fingerprint_unrecognized(); +} + +static int +compute_fingerprint(string_writer_t *w, PyObject *val) +{ + /* + * Implementation note: for performance, we start with common + * types that can be tested with fast checks. + */ + if (val == Py_None) + return string_writer_put_char(w, OP_NONE); + if (PyBool_Check(val)) + return string_writer_put_char(w, OP_BOOL); + /* Note we avoid matching int subclasses such as IntEnum */ + if (PyInt_CheckExact(val) || PyLong_CheckExact(val)) + return string_writer_put_char(w, OP_INT); + if (PyFloat_Check(val)) + return string_writer_put_char(w, OP_FLOAT); + if (PyComplex_CheckExact(val)) + return string_writer_put_char(w, OP_COMPLEX); + if (PyTuple_Check(val)) { + if(PyTuple_CheckExact(val)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(val); + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + TRY(string_writer_put_char, w, OP_END_TUPLE); + return 0; + } + /* as per typeof.py, check "_asdict" for namedtuple. */ + else if(PyObject_HasAttrString(val, "_asdict")) + { + /* + * This encodes the class name and field names of a namedtuple into + * the fingerprint on the condition that the number of fields is + * small (<10) and that the class name and field names are encodable + * as ASCII. 
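+ *
+ * As a worked example (for illustration only; Point is hypothetical and
+ * not defined here): for Point = namedtuple("Point", ["x", "y"]) and the
+ * value Point(1, 2.5), the writer receives the ASCII bytes of "Point",
+ * then OP_START_TUPLE, the bytes of "x" followed by OP_INT, the bytes of
+ * "y" followed by OP_FLOAT, and finally OP_END_TUPLE.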
+ */ + PyObject * clazz = NULL; + PyObject * name = NULL; + PyObject * _fields = PyObject_GetAttrString(val, "_fields"); + PyObject * field = NULL; + PyObject * ascii_str = NULL; + Py_ssize_t i, n, j, flen; + char * buf = NULL; + int ret; + + clazz = PyObject_GetAttrString(val, "__class__"); + if (clazz == NULL) + return -1; + + name = PyObject_GetAttrString(clazz, "__name__"); + Py_DECREF(clazz); + if (name == NULL) + return -1; + + ascii_str = PyUnicode_AsEncodedString(name, "ascii", "ignore"); + Py_DECREF(name); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + + if (_fields == NULL) + return -1; + + n = PyTuple_GET_SIZE(val); + + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) { + field = PyTuple_GET_ITEM(_fields, i); + if (field == NULL) + return -1; + ascii_str = PyUnicode_AsEncodedString(field, "ascii", "ignore"); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + } + TRY(string_writer_put_char, w, OP_END_TUPLE); + Py_DECREF(_fields); + return 0; + } + } + if (PyBytes_Check(val)) + return string_writer_put_char(w, OP_BYTES); + if (PyByteArray_Check(val)) + return string_writer_put_char(w, OP_BYTEARRAY); + if ((PyObject *) Py_TYPE(val) == omittedarg_type) { + PyObject *default_val = PyObject_GetAttr(val, str_value); + if (default_val == NULL) + return -1; + TRY(string_writer_put_char, w, OP_OMITTED); + TRY(compute_fingerprint, w, default_val); + Py_DECREF(default_val); + return 0; + } + + /* Skip numpy scalar check to prevent segfault */ + // if (PyArray_IsScalar(val, Generic)) { + // PyArray_Descr *descr = PyArray_DescrFromScalar(val); + // if (descr == NULL) + // return -1; + // TRY(string_writer_put_char, w, OP_NP_SCALAR); + // TRY(compute_dtype_fingerprint, w, descr); + // Py_DECREF(descr); + // return 0; + // } + + /* Skip numpy array check to prevent segfault */ + // if (PyArray_Check(val)) { + // PyArrayObject *ary = (PyArrayObject *) val; + // int ndim = PyArray_NDIM(ary); + + // TRY(string_writer_put_char, w, OP_NP_ARRAY); + // TRY(string_writer_put_int32, w, ndim); + // if (PyArray_IS_C_CONTIGUOUS(ary)) + // TRY(string_writer_put_char, w, 'C'); + // else if (PyArray_IS_F_CONTIGUOUS(ary)) + // TRY(string_writer_put_char, w, 'F'); + // else + // TRY(string_writer_put_char, w, 'A'); + // if (PyArray_ISWRITEABLE(ary)) + // TRY(string_writer_put_char, w, 'W'); + // else + // TRY(string_writer_put_char, w, 'R'); + // return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); + // } + + if (PyList_Check(val)) { + Py_ssize_t n = PyList_GET_SIZE(val); + if (n == 0) { + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty list"); + return -1; + } + /* Only the first item is considered, as in typeof.py */ + TRY(string_writer_put_char, w, OP_LIST); + TRY(compute_fingerprint, w, PyList_GET_ITEM(val, 0)); + return 0; + } + /* Note we only accept sets, not frozensets */ + if (Py_TYPE(val) == &PySet_Type) { + Py_hash_t h; + PyObject *item; + Py_ssize_t pos = 0; + /* Only one item is considered, as in typeof.py */ + if (!_PySet_NextEntry(val, &pos, &item, &h)) { + /* Empty set */ + PyErr_SetString(PyExc_ValueError, + "cannot compute 
fingerprint of empty set"); + return -1; + } + TRY(string_writer_put_char, w, OP_SET); + TRY(compute_fingerprint, w, item); + return 0; + } + if (PyObject_CheckBuffer(val)) { + Py_buffer buf; + int flags = PyBUF_ND | PyBUF_STRIDES | PyBUF_FORMAT; + char contig; + int ndim; + char readonly; + + /* Attempt to get a writable buffer, then fallback on read-only */ + if (PyObject_GetBuffer(val, &buf, flags | PyBUF_WRITABLE)) { + PyErr_Clear(); + if (PyObject_GetBuffer(val, &buf, flags)) + goto _unrecognized; + } + if (PyBuffer_IsContiguous(&buf, 'C')) + contig = 'C'; + else if (PyBuffer_IsContiguous(&buf, 'F')) + contig = 'F'; + else + contig = 'A'; + ndim = buf.ndim; + readonly = buf.readonly ? 'R' : 'W'; + if (string_writer_put_char(w, OP_BUFFER) || + string_writer_put_int32(w, ndim) || + string_writer_put_char(w, contig) || + string_writer_put_char(w, readonly) || + string_writer_put_string(w, buf.format) || + /* We serialize the object's Python type as well, to + distinguish between types which have Numba specializations + (e.g. array.array() vs. memoryview) + */ + string_writer_put_intp(w, (npy_intp) Py_TYPE(val))) { + PyBuffer_Release(&buf); + return -1; + } + PyBuffer_Release(&buf); + return 0; + } + + /* Skip numpy array descriptor check to prevent segfault */ + // if (NUMBA_PyArray_DescrCheck(val)) { + // TRY(string_writer_put_char, w, OP_NP_DTYPE); + // return compute_dtype_fingerprint(w, (PyArray_Descr *) val); + // } + +_unrecognized: + /* Type not recognized */ + return fingerprint_unrecognized(); +} + +PyObject * +typeof_compute_fingerprint(PyObject *val) +{ + PyObject *res; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) + goto error; + res = PyBytes_FromStringAndSize(w.buf, w.n); + + string_writer_clear(&w); + return res; + +error: + string_writer_clear(&w); + return NULL; +} + +/* + * Getting the typecode from a Type object. + */ +static int +_typecode_from_type_object(PyObject *tyobj) { + int typecode; + PyObject *tmpcode = PyObject_GetAttrString(tyobj, "_code"); + if (tmpcode == NULL) { + return -1; + } + typecode = PyLong_AsLong(tmpcode); + Py_DECREF(tmpcode); + return typecode; +} + +/* When we want to cache the type's typecode for later lookup, we need to + keep a reference to the returned type object so that it cannot be + deleted. This is because of the following events occurring when first + using a @jit function for a given set of types: + + 1. typecode_fallback requests a new typecode for an arbitrary Python value; + this implies creating a Numba type object (on the first dispatcher call); + the typecode cache is then populated. + 2. matching of the typecode list in _dispatcherimpl.cpp fails, since the + typecode is new. + 3. we have to compile: compile_and_invoke() is called, it will invoke + Dispatcher_Insert to register the new signature. + + The reference to the Numba type object returned in step 1 is deleted as + soon as we call Py_DECREF() on it, since we are holding the only + reference. If this happens and we use the typecode we got to populate the + cache, then the cache won't ever return the correct typecode, and the + dispatcher will never successfully match the typecodes with those of + some already-compiled instance. So we need to make sure that we don't + call Py_DECREF() on objects whose typecode will be used to populate the + cache. This is ensured by calling _typecode_fallback with + retain_reference == 0. 
+ + Note that technically we are leaking the reference, since we do not continue + to hold a pointer to the type object that we get back from typeof_pyval. + However, we don't need to refer to it again, we just need to make sure that + it is never deleted. +*/ +static int +_typecode_fallback(PyObject *dispatcher, PyObject *val, + int retain_reference) { + PyObject *numba_type; + int typecode; + + /* + * For values that define "_numba_type_", which holds a numba Type + * instance that should be used as the type of the value. + * Note this is done here, not in typeof_typecode(), so that + * some values can still benefit from fingerprint caching. + */ + if (str_numba_type != NULL && PyObject_HasAttr(val, str_numba_type)) { + numba_type = PyObject_GetAttrString(val, "_numba_type_"); + if (!numba_type) + return -1; + } + else { + // Go back to the interpreter + numba_type = PyObject_CallMethodObjArgs((PyObject *) dispatcher, + str_typeof_pyval, val, NULL); + } + if (!numba_type) + return -1; + typecode = _typecode_from_type_object(numba_type); + if (!retain_reference) + Py_DECREF(numba_type); + return typecode; +} + +/* Variations on _typecode_fallback for convenience */ + +static +int typecode_fallback(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 0); +} + +static +int typecode_fallback_keep_ref(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 1); +} + + +/* A cache mapping fingerprints (string_writer_t *) to typecodes (int). */ +static _Numba_hashtable_t *fingerprint_hashtable = NULL; + +static Py_uhash_t +hash_writer(const void *key) +{ + string_writer_t *writer = (string_writer_t *) key; + Py_uhash_t x = 0; + + /* The old FNV algorithm used by Python 2 */ + if (writer->n > 0) { + unsigned char *p = (unsigned char *) writer->buf; + Py_ssize_t len = writer->n; + x ^= *p << 7; + while (--len >= 0) + x = (1000003*x) ^ *p++; + x ^= writer->n; + if (x == (Py_uhash_t) -1) + x = -2; + } + return x; +} + +static int +compare_writer(const void *key, const _Numba_hashtable_entry_t *entry) +{ + string_writer_t *v = (string_writer_t *) key; + string_writer_t *w = (string_writer_t *) entry->key; + if (v->n != w->n) + return 0; + return memcmp(v->buf, w->buf, v->n) == 0; +} + +/* Try to compute *val*'s typecode using its fingerprint and the + * fingerprint->typecode cache. + */ +static int +typecode_using_fingerprint(PyObject *dispatcher, PyObject *val) +{ + int typecode; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) { + string_writer_clear(&w); + if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) { + /* Can't compute a type fingerprint for the given value, + fall back on typeof() without caching. */ + PyErr_Clear(); + return typecode_fallback(dispatcher, val); + } + return -1; + } + + /* Check if hashtable is initialized */ + if (fingerprint_hashtable == NULL) { + string_writer_clear(&w); + return typecode_fallback(dispatcher, val); + } + + if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0) { + /* Cache hit */ + string_writer_clear(&w); + return typecode; + } + + /* Not found in cache: invoke pure Python typeof() and cache result. + * Note we have to keep the type alive forever as explained + * above in _typecode_fallback(). 
+ */ + typecode = typecode_fallback_keep_ref(dispatcher, val); + if (typecode >= 0) { + string_writer_t *key = (string_writer_t *) malloc(sizeof(string_writer_t)); + if (key == NULL) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + /* Ownership of the string writer's buffer will be transferred + * to the hash table. + */ + string_writer_move(key, &w); + if (_Numba_HASHTABLE_SET(fingerprint_hashtable, key, typecode)) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + } + return typecode; +} + + +/* + * Direct lookup table for extra-fast typecode resolution of simple array types. + */ + +#define N_DTYPES 12 +#define N_NDIM 5 /* Fast path for up to 5D array */ +#define N_LAYOUT 3 +static int cached_arycode[N_NDIM][N_LAYOUT][N_DTYPES]; + +/* Convert a Numpy dtype number to an internal index into cached_arycode. + The returned value must also be a valid index into BASIC_TYPECODES. */ +static int dtype_num_to_typecode(int type_num) { + int dtype; + switch(type_num) { + case NPY_INT8: + dtype = 0; + break; + case NPY_INT16: + dtype = 1; + break; + case NPY_INT32: + dtype = 2; + break; + case NPY_INT64: + dtype = 3; + break; + case NPY_UINT8: + dtype = 4; + break; + case NPY_UINT16: + dtype = 5; + break; + case NPY_UINT32: + dtype = 6; + break; + case NPY_UINT64: + dtype = 7; + break; + case NPY_FLOAT32: + dtype = 8; + break; + case NPY_FLOAT64: + dtype = 9; + break; + case NPY_COMPLEX64: + dtype = 10; + break; + case NPY_COMPLEX128: + dtype = 11; + break; + default: + /* Type not included in the global lookup table */ + dtype = -1; + } + return dtype; +} + +static +int get_cached_typecode(PyArray_Descr* descr) { + PyObject* tmpobject = PyDict_GetItem(typecache, (PyObject*)descr); + if (tmpobject == NULL) + return -1; + + return PyLong_AsLong(tmpobject); +} + +static +void cache_typecode(PyArray_Descr* descr, int typecode) { + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(typecache, (PyObject*)descr, value); + Py_DECREF(value); +} + +static +PyObject* ndarray_key(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* tmpndim = PyLong_FromLong(ndim); + PyObject* tmplayout = PyLong_FromLong(layout); + PyObject* tmpreadonly = PyLong_FromLong(readonly); + PyObject* key = PyTuple_Pack(4, tmpndim, tmplayout, tmpreadonly, descr); + Py_DECREF(tmpndim); + Py_DECREF(tmplayout); + Py_DECREF(tmpreadonly); + return key; +} + +static +int get_cached_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject *tmpobject = PyDict_GetItem(ndarray_typecache, key); + if (tmpobject == NULL) + return -1; + + Py_DECREF(key); + return PyLong_AsLong(tmpobject); +} + +static +void cache_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr, + int typecode) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(ndarray_typecache, key, value); + Py_DECREF(key); + Py_DECREF(value); +} + +static +int typecode_ndarray(PyObject *dispatcher, PyArrayObject *ary) { + int typecode; + int dtype; + int ndim = PyArray_NDIM(ary); + int layout = 0; + int readonly = 0; + + /* The order in which we check for the right contiguous-ness is important. + The order must match the order by numba.numpy_support.map_layout. + Further, only *contiguous-ness* is checked, not alignment, byte order or + write permissions. 
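+
+   For reference, the mapping established by the checks below is:
+
+     layout = 1  for C-contiguous arrays   (numba layout 'C')
+     layout = 2  for F-contiguous arrays   (numba layout 'F')
+     layout = 0  otherwise                 (numba layout 'A')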
+ */ + if (PyArray_IS_C_CONTIGUOUS(ary)){ + layout = 1; + } else if (PyArray_IS_F_CONTIGUOUS(ary)) { + layout = 2; + } + + /* the typecode cache by convention is for "behaved" arrays (aligned and + * writeable), all others must be forced to the fall back */ + if (!PyArray_ISBEHAVED(ary)) goto FALLBACK; + + if (ndim <= 0 || ndim > N_NDIM) goto FALLBACK; + + dtype = dtype_num_to_typecode(PyArray_TYPE(ary)); + if (dtype == -1) goto FALLBACK; + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + + typecode = cached_arycode[ndim - 1][layout][dtype]; + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types */ + + /* If this isn't a structured array then we can't use the cache */ + if (PyArray_TYPE(ary) != NPY_VOID) + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); + + /* Check type cache */ + readonly = !PyArray_ISWRITEABLE(ary); + typecode = get_cached_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary)); + if (typecode == -1) { + /* First use of this type, use fallback and populate the cache */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cache_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary), typecode); + } + return typecode; +} + +static +int typecode_arrayscalar(PyObject *dispatcher, PyObject* aryscalar) { + int typecode; + PyArray_Descr *descr; + descr = PyArray_DescrFromScalar(aryscalar); + if (!descr) + return typecode_using_fingerprint(dispatcher, aryscalar); + + /* Is it a structured scalar? */ + if (descr->type_num == NPY_VOID) { + typecode = get_cached_typecode(descr); + if (typecode == -1) { + /* Resolve through fallback then populate cache */ + typecode = typecode_fallback_keep_ref(dispatcher, aryscalar); + cache_typecode(descr, typecode); + } + Py_DECREF(descr); + return typecode; + } + + /* Is it one of the well-known basic types? */ + typecode = dtype_num_to_typecode(descr->type_num); + Py_DECREF(descr); + if (typecode == -1) + return typecode_using_fingerprint(dispatcher, aryscalar); + return BASIC_TYPECODES[typecode]; +} + +static +int typecode_devicendarray(PyObject *dispatcher, PyObject *ary) +{ + int typecode; + int dtype; + int ndim; + int layout = 0; + PyObject *ndim_obj = nullptr; + PyObject *num_obj = nullptr; + PyObject *dtype_obj = nullptr; + int dtype_num = 0; + + PyObject* flags = PyObject_GetAttrString(ary, "flags"); + if (flags == NULL) + { + PyErr_Clear(); + goto FALLBACK; + } + + if (PyDict_GetItemString(flags, "C_CONTIGUOUS") == Py_True) { + layout = 1; + } else if (PyDict_GetItemString(flags, "F_CONTIGUOUS") == Py_True) { + layout = 2; + } + + Py_DECREF(flags); + + ndim_obj = PyObject_GetAttrString(ary, "ndim"); + if (ndim_obj == NULL) { + /* If there's no ndim, try to proceed by clearing the error and using the + * fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + ndim = PyLong_AsLong(ndim_obj); + Py_DECREF(ndim_obj); + + if (PyErr_Occurred()) { + /* ndim wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + if (ndim <= 0 || ndim > N_NDIM) + goto FALLBACK; + + dtype_obj = PyObject_GetAttrString(ary, "dtype"); + if (dtype_obj == NULL) { + /* No dtype: try the fallback. 
*/ + PyErr_Clear(); + goto FALLBACK; + } + + num_obj = PyObject_GetAttrString(dtype_obj, "num"); + Py_DECREF(dtype_obj); + + if (num_obj == NULL) { + /* This strange dtype has no num - try the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype_num = PyLong_AsLong(num_obj); + Py_DECREF(num_obj); + + if (PyErr_Occurred()) { + /* num wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype = dtype_num_to_typecode(dtype_num); + if (dtype == -1) { + /* Not a dtype we have in the global lookup table. */ + goto FALLBACK; + } + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + typecode = cached_arycode[ndim - 1][layout][dtype]; + + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types. At present this always uses + the fingerprinting to get the typecode. Future optimization might + implement a cache, but this would require some fast equivalent of + PyArray_DESCR for a device array. */ + + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); +} + +extern "C" int +typeof_typecode(PyObject *dispatcher, PyObject *val) +{ + int subtype_attr; + /* This needs to be kept in sync with Dispatcher.typeof_pyval(), + * otherwise funny things may happen. + */ + + if (PyLong_Check(val)) { +#if SIZEOF_VOID_P < 8 + /* On 32-bit platforms, choose between tc_intp (32-bit) and tc_int64 */ + PY_LONG_LONG ll = PyLong_AsLongLong(val); + if (ll == -1 && PyErr_Occurred()) { + /* The integer is too large, let us truncate it */ + PyErr_Clear(); + return tc_int64; + } + if ((ll & 0xffffffff) != ll) + return tc_int64; +#endif + return tc_intp; + } + else if (PyFloat_Check(val)) + return tc_float64; + else if (PyComplex_Check(val)) + return tc_complex128; + /* Skip all problematic array checks for now to avoid segfault */ + // /* Array scalar handling */ + // else if (PyArray_CheckScalar(val)) { + // return typecode_arrayscalar(dispatcher, val); + // } + // /* Array handling */ + // else if (tyobj == &PyArray_Type) { + // return typecode_ndarray(dispatcher, (PyArrayObject*)val); + // } + // /* Subtype of CUDA device array */ + // else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { + // return typecode_devicendarray(dispatcher, val); + // } + // /* Subtypes of Array handling */ + // else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + // /* By default, Numba will treat all numpy.ndarray subtypes as if they + // were the base numpy.ndarray type. In this way, ndarray subtypes + // can easily use all of the support that Numba has for ndarray + // methods. + // EXPERIMENTAL: There may be cases where a programmer would NOT want + // ndarray subtypes to be treated exactly like the base numpy.ndarray. + // For this purpose, a currently experimental feature allows a + // programmer to add an attribute named + // __numba_array_subtype_dispatch__ to their ndarray subtype. This + // attribute can have any value as Numba only checks for the presence + // of the attribute and not its value. When present, a ndarray subtype + // will NOT be typed by Numba as a regular ndarray but this code will + // fallthrough to the typecode_using_fingerprint call, which will + // create a new unique Numba typecode for this ndarray subtype. 
This + // behavior has several significant effects. First, since this + // ndarray subtype will be treated as a different type by Numba, + // the Numba dispatcher would then specialize on this type. So, if + // there was a function that had several parameters that were + // expected to be either numpy.ndarray or a subtype of ndarray, then + // Numba would compile a custom version of this function for each + // combination of base and subtypes that were actually passed to the + // function. Second, because this subtype would now be treated as + // a totally separate type, it will cease to function in Numba unless + // an implementation of that type is provided to Numba through the + // Numba type extension mechanisms (e.g., overload). This would + // typically start with defining a Numba type corresponding to the + // ndarray subtype. This is the same concept as how Numba has a + // corollary of numpy.ndarray in its type system as types.Array. + // Next, one would typically defining boxing and unboxing routines + // and the associated memory model. Then, overloads for NumPy + // functions on that type would be created. However, + // if the same default array memory model is used then there are tricks + // one can do to look at Numba's internal types.Array registries and + // to quickly apply those to the subtype as well. In this manner, + // only those cases where the base ndarray and the ndarray subtype + // behavior differ would new custom functions need to be written for + // the subtype. Finally, + // after adding support for the new type, you would have a separate + // ndarray subtype that could operate with other objects of the same + // subtype but would not support interoperation with regular NumPy + // ndarrays. In standard Python, this interoperation is provided + // through the __array_ufunc__ magic method in the ndarray subtype + // class and in that case the function operates on ndarrays or their + // subtypes. This idea is extended into Numba such that + // __array_ufunc__ can be present in a Numba array type object. + // In this case, this function is consulted during Numba typing and + // so the arguments to __array_ufunc__ are Numba types instead of + // ndarray subtypes. The array type __array_ufunc__ returns the + // type of the output of the given ufunc. + // */ + // subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__"); + // if (!subtype_attr) { + // return typecode_ndarray(dispatcher, (PyArrayObject*)val); + // } + // } + + return typecode_using_fingerprint(dispatcher, val); +} + + +static +void* wrap_import_array(void) { + import_array(); /* import array returns NULL on failure */ + return (void*)1; +} + + +static +int init_numpy(void) { + return wrap_import_array() != NULL; +} + + +/* + * typeof_init(omittedarg_type, typecode_dict) + * (called from dispatcher.py to fill in missing information) + */ +extern "C" PyObject * +typeof_init(PyObject *self, PyObject *args) +{ + PyObject *tmpobj; + PyObject *dict; + int index = 0; + + if (!PyArg_ParseTuple(args, "O!O!:typeof_init", + &PyType_Type, &omittedarg_type, + &PyDict_Type, &dict)) + return NULL; + + /* Initialize Numpy API */ + if ( ! 
init_numpy() ) { + return NULL; + } + + #define UNWRAP_TYPE(S) \ + if(!(tmpobj = PyDict_GetItemString(dict, #S))) return NULL; \ + else { tc_##S = PyLong_AsLong(tmpobj); \ + BASIC_TYPECODES[index++] = tc_##S; } + + UNWRAP_TYPE(int8) + UNWRAP_TYPE(int16) + UNWRAP_TYPE(int32) + UNWRAP_TYPE(int64) + + UNWRAP_TYPE(uint8) + UNWRAP_TYPE(uint16) + UNWRAP_TYPE(uint32) + UNWRAP_TYPE(uint64) + + UNWRAP_TYPE(float32) + UNWRAP_TYPE(float64) + + UNWRAP_TYPE(complex64) + UNWRAP_TYPE(complex128) + + switch(sizeof(void*)) { + case 4: + tc_intp = tc_int32; + break; + case 8: + tc_intp = tc_int64; + break; + default: + PyErr_SetString(PyExc_AssertionError, "sizeof(void*) != {4, 8}"); + return NULL; + } + + #undef UNWRAP_TYPE + + typecache = PyDict_New(); + ndarray_typecache = PyDict_New(); + structured_dtypes = PyDict_New(); + if (typecache == NULL || ndarray_typecache == NULL || + structured_dtypes == NULL) { + PyErr_SetString(PyExc_RuntimeError, "failed to create type cache"); + return NULL; + } + + fingerprint_hashtable = _Numba_hashtable_new(sizeof(int), + hash_writer, + compare_writer); + if (fingerprint_hashtable == NULL) { + PyErr_NoMemory(); + return NULL; + } + + /* initialize cached_arycode to all ones (in bits) */ + memset(cached_arycode, 0xFF, sizeof(cached_arycode)); + + str_typeof_pyval = PyString_InternFromString("typeof_pyval"); + str_value = PyString_InternFromString("value"); + str_numba_type = PyString_InternFromString("_numba_type_"); + if (!str_value || !str_typeof_pyval || !str_numba_type) + return NULL; + + Py_RETURN_NONE; +} diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.h b/numba_cuda/numba/cuda/_dispatcher/_typeof.h new file mode 100644 index 000000000..6e0039b5f --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.h @@ -0,0 +1,16 @@ +#ifndef NUMBA_TYPEOF_H_ +#define NUMBA_TYPEOF_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +extern PyObject *typeof_init(PyObject *self, PyObject *args); +extern int typeof_typecode(PyObject *dispatcher, PyObject *val); +extern PyObject *typeof_compute_fingerprint(PyObject *val); + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_TYPEOF_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp b/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp new file mode 100644 index 000000000..3c51fdfa7 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp @@ -0,0 +1,209 @@ +#include +#include +#include +#include + +#include "typeconv.hpp" + + +// ------ TypeManager ------ + +TCCMap::TCCMap() + : nb_records(0) +{ +} + +size_t TCCMap::hash(const TypePair &key) const { + return std::hash()(std::hash()(key.first)) ^ + std::hash()(key.second); +} + +void TCCMap::insert(const TypePair &key, TypeCompatibleCode val) { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + bin[j].val = val; + return; + } + } + bin.push_back({key, val}); + nb_records++; +} + +TypeCompatibleCode TCCMap::find(const TypePair &key) const { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + const TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + return bin[j].val; + } + } + return TCC_FALSE; +} + +// ----- Ratings ----- +Rating::Rating() : promote(0), safe_convert(0), unsafe_convert(0) { } + +inline bool Rating::operator < (const Rating &other) const { + if (unsafe_convert < other.unsafe_convert) + return true; + else if (unsafe_convert > other.unsafe_convert) + return false; + if (safe_convert < 
other.safe_convert) + return true; + else if (safe_convert > other.safe_convert) + return false; + return (promote < other.promote); +} + +inline bool Rating::operator == (const Rating &other) const { + return promote == other.promote && safe_convert == other.safe_convert && + unsafe_convert == other.unsafe_convert; +} + +// ------ TypeManager ------ + +bool TypeManager::canPromote(Type from, Type to) const { + return isCompatible(from, to) == TCC_PROMOTE; +} + +bool TypeManager::canSafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_SAFE; +} + +bool TypeManager::canUnsafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_UNSAFE; +} + +void TypeManager::addPromotion(Type from, Type to) { + return addCompatibility(from, to, TCC_PROMOTE); +} + +void TypeManager::addUnsafeConversion(Type from, Type to) { + return addCompatibility(from, to, TCC_CONVERT_UNSAFE); +} + +void TypeManager::addSafeConversion(Type from, Type to) { + return addCompatibility(from, to, TCC_CONVERT_SAFE); +} + +void TypeManager::addCompatibility(Type from, Type to, TypeCompatibleCode tcc) { + TypePair pair(from, to); + tccmap.insert(pair, tcc); +} + +TypeCompatibleCode TypeManager::isCompatible(Type from, Type to) const { + if (from == to) + return TCC_EXACT; + TypePair pair(from, to); + return tccmap.find(pair); +} + + +int TypeManager::selectOverload(const Type sig[], const Type ovsigs[], + int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required + ) const { + int count; + if (ovct <= 16) { + Rating ratings[16]; + int candidates[16]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + } + else { + Rating *ratings = new Rating[ovct]; + int *candidates = new int[ovct]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + delete [] ratings; + delete [] candidates; + } + return count; +} + +int TypeManager::_selectOverload(const Type sig[], const Type ovsigs[], + int &selected, int sigsz, int ovct, + bool allow_unsafe, bool exact_match_required, + Rating ratings[], int candidates[]) const { + // Generate rating table + // Use a penalize scheme. 
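+    // (Each argument that needs a promotion or a conversion bumps the
+    // corresponding counter in that candidate's Rating; Rating::operator<
+    // then prefers the fewest unsafe conversions, then the fewest safe
+    // conversions, then the fewest promotions.)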
+
+    int nb_candidates = 0;
+
+    for (int i = 0; i < ovct; ++i) {
+        const Type *entry = &ovsigs[i * sigsz];
+        Rating rate;
+
+        for (int j = 0; j < sigsz; ++j) {
+            TypeCompatibleCode tcc = isCompatible(sig[j], entry[j]);
+            if (tcc == TCC_FALSE ||
+                (tcc == TCC_CONVERT_UNSAFE && !allow_unsafe) ||
+                (tcc != TCC_EXACT && exact_match_required)) {
+                // stop the loop early
+                goto _incompatible;
+            }
+            switch(tcc) {
+            case TCC_PROMOTE:
+                rate.promote += 1;
+                break;
+            case TCC_CONVERT_SAFE:
+                rate.safe_convert += 1;
+                break;
+            case TCC_CONVERT_UNSAFE:
+                rate.unsafe_convert += 1;
+                break;
+            default:
+                break;
+            }
+        }
+        ratings[nb_candidates] = rate;
+        candidates[nb_candidates] = i;
+        nb_candidates++;
+    _incompatible:
+        ;
+    }
+
+    // Bail if no match
+    if (nb_candidates == 0)
+        return 0;
+
+    // Find lowest rating
+    Rating best = ratings[0];
+    selected = candidates[0];
+
+    int matchcount = 1;
+    for (int i = 1; i < nb_candidates; ++i) {
+        if (ratings[i] < best) {
+            best = ratings[i];
+            selected = candidates[i];
+            matchcount = 1;
+        }
+        else if (ratings[i] == best) {
+            matchcount += 1;
+        }
+    }
+    return matchcount;
+}
+
+// ----- utils -----
+
+const char* TCCString(TypeCompatibleCode tcc) {
+    switch(tcc) {
+    case TCC_EXACT:
+        return "exact";
+    case TCC_SUBTYPE:
+        return "subtype";
+    case TCC_PROMOTE:
+        return "promote";
+    case TCC_CONVERT_SAFE:
+        return "safe_convert";
+    case TCC_CONVERT_UNSAFE:
+        return "unsafe_convert";
+    default:
+        return "false";
+    }
+}
diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp b/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp
new file mode 100644
index 000000000..1f3cb9359
--- /dev/null
+++ b/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp
@@ -0,0 +1,98 @@
+#ifndef NUMBA_TYPECONV_HPP_
+#define NUMBA_TYPECONV_HPP_
+#include <utility>
+#include <vector>
+
+
+typedef int Type;
+
+enum TypeCompatibleCode{
+    // No match
+    TCC_FALSE = 0,
+    // Exact match
+    TCC_EXACT,
+    // Subtype is UNUSED
+    TCC_SUBTYPE,
+    // Promotion with no precision loss
+    TCC_PROMOTE,
+    // Conversion with no precision loss
+    // e.g. int32 to double
+    TCC_CONVERT_SAFE,
+    // Conversion with precision loss
+    // e.g. int64 to double (53 bits precision)
+    TCC_CONVERT_UNSAFE,
+};
+
+typedef std::pair<Type, Type> TypePair;
+
+struct TCCRecord {
+    TypePair key;
+    TypeCompatibleCode val;
+};
+
+typedef std::vector<TCCRecord> TCCMapBin;
+
+class TCCMap {
+public:
+    TCCMap();
+
+    void insert(const TypePair &key, TypeCompatibleCode val);
+    TypeCompatibleCode find(const TypePair &key) const;
+private:
+    size_t hash(const TypePair &key) const;
+
+    /* Must be a power of two */
+    static const size_t TCCMAP_SIZE = 512;
+    TCCMapBin records[TCCMAP_SIZE];
+    int nb_records;
+};
+
+struct Rating {
+    unsigned int promote;
+    unsigned int safe_convert;
+    unsigned int unsafe_convert;
+
+    Rating();
+
+    bool operator < (const Rating &other) const;
+    bool operator == (const Rating &other) const;
+};
+
+
+class TypeManager{
+public:
+    bool canPromote(Type from, Type to) const;
+    bool canUnsafeConvert(Type from, Type to) const;
+    bool canSafeConvert(Type from, Type to) const;
+
+    void addPromotion(Type from, Type to);
+    void addUnsafeConversion(Type from, Type to);
+    void addSafeConversion(Type from, Type to);
+    void addCompatibility(Type from, Type to, TypeCompatibleCode by);
+
+    TypeCompatibleCode isCompatible(Type from, Type to) const;
+
+    /**
+        Output stored in selected.
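+        (selected receives the index of the winning overload; sig holds
+        sigsz argument typecodes and ovsigs holds ovct candidate signatures,
+        stored contiguously with sigsz entries per candidate.)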
+ Returns + Number of matches + */ + int selectOverload(const Type sig[], const Type ovsigs[], int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required + ) const; + +private: + int _selectOverload(const Type sig[], const Type ovsigs[], int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required, + Rating ratings[], int candidates[]) const; + + TCCMap tccmap; +}; + + +const char* TCCString(TypeCompatibleCode tcc); + + +#endif // NUMBA_TYPECONV_HPP_ diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 68706bfd1..d02beffb0 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba import _dispatcher +from numba.cuda import _dispatcher from warnings import warn diff --git a/pyproject.toml b/pyproject.toml index 3fd0f65fc..e0c2da629 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", + "numpy", ] [project] diff --git a/setup.py b/setup.py index bfb11f27a..31f66ae0e 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,103 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pathlib +import sys -from setuptools import setup +from setuptools import setup, Extension from setuptools.command.build_py import build_py from setuptools.command.editable_wheel import editable_wheel, _TopLevelFinder +from setuptools.command.build_ext import build_ext REDIRECTOR_PTH = "_numba_cuda_redirector.pth" REDIRECTOR_PY = "_numba_cuda_redirector.py" SITE_PACKAGES = pathlib.Path("site-packages") +def get_version(): + """Read version from VERSION file.""" + version_file = pathlib.Path(__file__).parent / "numba_cuda" / "VERSION" + return version_file.read_text().strip() + + +def get_ext_modules(): + """ + Return a list of Extension instances for the setup() call. + """ + # Note we don't import NumPy at the toplevel, since setup.py + # should be able to run without NumPy for pip to discover the + # build dependencies. Need NumPy headers and libm linkage. + import numpy as np + + np_compile_args = { + "include_dirs": [ + np.get_include(), + ], + } + if sys.platform != "win32": + np_compile_args["libraries"] = [ + "m", + ] + + dispatcher_sources = [ + "numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp", + "numba_cuda/numba/cuda/_dispatcher/_typeof.cpp", + "numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp", + "numba_cuda/numba/cuda/_dispatcher/typeconv.cpp", + ] + ext_dispatcher = Extension( + name="numba_cuda.numba.cuda._dispatcher", + sources=dispatcher_sources, + depends=[ + "numba_cuda/numba/cuda/_dispatcher/_pymodule.h", + "numba_cuda/numba/cuda/_dispatcher/_typeof.h", + "numba_cuda/numba/cuda/_dispatcher/_hashtable.h", + ], + extra_compile_args=["-std=c++11"], + **np_compile_args, + ) + + # Add our include directory to the existing include_dirs + ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/_dispatcher") + + return [ext_dispatcher] + + +def is_building(): + """ + Parse the setup.py command and return whether a build is requested. + If False is returned, only an informational command is run. + If True is returned, information about C extensions will have to + be passed to the setup() function. + """ + if len(sys.argv) < 2: + # User forgot to give an argument probably, let setuptools handle that. 
+ return True + + build_commands = [ + "build", + "build_py", + "build_ext", + "build_clibbuild_scripts", + "install", + "install_lib", + "install_headers", + "install_scripts", + "install_data", + "sdist", + "bdist", + "bdist_dumb", + "bdist_rpm", + "bdist_wininst", + "check", + "build_doc", + "bdist_wheel", + "bdist_egg", + "develop", + "easy_install", + "test", + ] + return any(bc in sys.argv[1:] for bc in build_commands) + + # Adapted from https://stackoverflow.com/a/71137790 class build_py_with_redirector(build_py): # noqa: N801 """Include the redirector files in the generated wheel.""" @@ -70,9 +158,52 @@ def _select_strategy(self, name, tag, build_lib): return TopLevelFinderWithRedirector(self.distribution, name) +cmdclass = {} + +numba_be_user_options = [ + ("werror", None, "Build extensions with -Werror"), + ("wall", None, "Build extensions with -Wall"), + ("noopt", None, "Build extensions without optimization"), +] + + +class NumbaBuildExt(build_ext): + user_options = build_ext.user_options + numba_be_user_options + boolean_options = build_ext.boolean_options + ["werror", "wall", "noopt"] + + def initialize_options(self): + super().initialize_options() + self.werror = 0 + self.wall = 0 + self.noopt = 0 + + def run(self): + extra_compile_args = [] + if self.noopt: + if sys.platform == "win32": + extra_compile_args.append("/Od") + else: + extra_compile_args.append("-O0") + if self.werror: + extra_compile_args.append("-Werror") + if self.wall: + extra_compile_args.append("-Wall") + for ext in self.extensions: + ext.extra_compile_args.extend(extra_compile_args) + + super().run() + + +cmdclass["build_ext"] = NumbaBuildExt +cmdclass["build_py"] = build_py_with_redirector +cmdclass["editable_wheel"] = editable_wheel_with_redirector + +if is_building(): + ext_modules = get_ext_modules() +else: + ext_modules = [] + setup( - cmdclass={ - "build_py": build_py_with_redirector, - "editable_wheel": editable_wheel_with_redirector, - } + cmdclass=cmdclass, + ext_modules=ext_modules, ) From e0c2ea8074477436d2c874a6b451d30580b3207f Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Mon, 4 Aug 2025 22:08:29 -0700 Subject: [PATCH 03/62] [Refactor][NFC] _dispatcher cext: Re-enable array handling, invoke typeof_init --- numba_cuda/numba/cuda/_dispatcher/_typeof.cpp | 227 +++++++++--------- numba_cuda/numba/cuda/dispatcher.py | 13 + 2 files changed, 120 insertions(+), 120 deletions(-) diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp index 99a1bd23e..16c5009c4 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp @@ -366,38 +366,35 @@ compute_fingerprint(string_writer_t *w, PyObject *val) Py_DECREF(default_val); return 0; } - - /* Skip numpy scalar check to prevent segfault */ - // if (PyArray_IsScalar(val, Generic)) { - // PyArray_Descr *descr = PyArray_DescrFromScalar(val); - // if (descr == NULL) - // return -1; - // TRY(string_writer_put_char, w, OP_NP_SCALAR); - // TRY(compute_dtype_fingerprint, w, descr); - // Py_DECREF(descr); - // return 0; - // } - - /* Skip numpy array check to prevent segfault */ - // if (PyArray_Check(val)) { - // PyArrayObject *ary = (PyArrayObject *) val; - // int ndim = PyArray_NDIM(ary); - - // TRY(string_writer_put_char, w, OP_NP_ARRAY); - // TRY(string_writer_put_int32, w, ndim); - // if (PyArray_IS_C_CONTIGUOUS(ary)) - // TRY(string_writer_put_char, w, 'C'); - // else if (PyArray_IS_F_CONTIGUOUS(ary)) - // TRY(string_writer_put_char, w, 'F'); - // else - 
// TRY(string_writer_put_char, w, 'A'); - // if (PyArray_ISWRITEABLE(ary)) - // TRY(string_writer_put_char, w, 'W'); - // else - // TRY(string_writer_put_char, w, 'R'); - // return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); - // } - + if (PyArray_IsScalar(val, Generic)) { + /* Note: PyArray_DescrFromScalar() may be a bit slow on + non-trivial types. */ + PyArray_Descr *descr = PyArray_DescrFromScalar(val); + if (descr == NULL) + return -1; + TRY(string_writer_put_char, w, OP_NP_SCALAR); + TRY(compute_dtype_fingerprint, w, descr); + Py_DECREF(descr); + return 0; + } + if (PyArray_Check(val)) { + PyArrayObject *ary = (PyArrayObject *) val; + int ndim = PyArray_NDIM(ary); + + TRY(string_writer_put_char, w, OP_NP_ARRAY); + TRY(string_writer_put_int32, w, ndim); + if (PyArray_IS_C_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'C'); + else if (PyArray_IS_F_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'F'); + else + TRY(string_writer_put_char, w, 'A'); + if (PyArray_ISWRITEABLE(ary)) + TRY(string_writer_put_char, w, 'W'); + else + TRY(string_writer_put_char, w, 'R'); + return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); + } if (PyList_Check(val)) { Py_ssize_t n = PyList_GET_SIZE(val); if (n == 0) { @@ -463,12 +460,10 @@ compute_fingerprint(string_writer_t *w, PyObject *val) PyBuffer_Release(&buf); return 0; } - - /* Skip numpy array descriptor check to prevent segfault */ - // if (NUMBA_PyArray_DescrCheck(val)) { - // TRY(string_writer_put_char, w, OP_NP_DTYPE); - // return compute_dtype_fingerprint(w, (PyArray_Descr *) val); - // } + if (NUMBA_PyArray_DescrCheck(val)) { + TRY(string_writer_put_char, w, OP_NP_DTYPE); + return compute_dtype_fingerprint(w, (PyArray_Descr *) val); + } _unrecognized: /* Type not recognized */ @@ -550,7 +545,7 @@ _typecode_fallback(PyObject *dispatcher, PyObject *val, * Note this is done here, not in typeof_typecode(), so that * some values can still benefit from fingerprint caching. */ - if (str_numba_type != NULL && PyObject_HasAttr(val, str_numba_type)) { + if (PyObject_HasAttr(val, str_numba_type)) { numba_type = PyObject_GetAttrString(val, "_numba_type_"); if (!numba_type) return -1; @@ -635,13 +630,6 @@ typecode_using_fingerprint(PyObject *dispatcher, PyObject *val) } return -1; } - - /* Check if hashtable is initialized */ - if (fingerprint_hashtable == NULL) { - string_writer_clear(&w); - return typecode_fallback(dispatcher, val); - } - if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0) { /* Cache hit */ string_writer_clear(&w); @@ -973,12 +961,12 @@ int typecode_devicendarray(PyObject *dispatcher, PyObject *ary) extern "C" int typeof_typecode(PyObject *dispatcher, PyObject *val) { + PyTypeObject *tyobj = Py_TYPE(val); int subtype_attr; /* This needs to be kept in sync with Dispatcher.typeof_pyval(), * otherwise funny things may happen. 
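     * (Note the branches below use exact type checks; instances of
     * subclasses fall through to the subtype and fingerprint handling
     * further down.)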
*/ - - if (PyLong_Check(val)) { + if (tyobj == &PyInt_Type || tyobj == &PyLong_Type) { #if SIZEOF_VOID_P < 8 /* On 32-bit platforms, choose between tc_intp (32-bit) and tc_int64 */ PY_LONG_LONG ll = PyLong_AsLongLong(val); @@ -992,80 +980,79 @@ typeof_typecode(PyObject *dispatcher, PyObject *val) #endif return tc_intp; } - else if (PyFloat_Check(val)) + else if (tyobj == &PyFloat_Type) return tc_float64; - else if (PyComplex_Check(val)) + else if (tyobj == &PyComplex_Type) return tc_complex128; - /* Skip all problematic array checks for now to avoid segfault */ - // /* Array scalar handling */ - // else if (PyArray_CheckScalar(val)) { - // return typecode_arrayscalar(dispatcher, val); - // } - // /* Array handling */ - // else if (tyobj == &PyArray_Type) { - // return typecode_ndarray(dispatcher, (PyArrayObject*)val); - // } - // /* Subtype of CUDA device array */ - // else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { - // return typecode_devicendarray(dispatcher, val); - // } - // /* Subtypes of Array handling */ - // else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { - // /* By default, Numba will treat all numpy.ndarray subtypes as if they - // were the base numpy.ndarray type. In this way, ndarray subtypes - // can easily use all of the support that Numba has for ndarray - // methods. - // EXPERIMENTAL: There may be cases where a programmer would NOT want - // ndarray subtypes to be treated exactly like the base numpy.ndarray. - // For this purpose, a currently experimental feature allows a - // programmer to add an attribute named - // __numba_array_subtype_dispatch__ to their ndarray subtype. This - // attribute can have any value as Numba only checks for the presence - // of the attribute and not its value. When present, a ndarray subtype - // will NOT be typed by Numba as a regular ndarray but this code will - // fallthrough to the typecode_using_fingerprint call, which will - // create a new unique Numba typecode for this ndarray subtype. This - // behavior has several significant effects. First, since this - // ndarray subtype will be treated as a different type by Numba, - // the Numba dispatcher would then specialize on this type. So, if - // there was a function that had several parameters that were - // expected to be either numpy.ndarray or a subtype of ndarray, then - // Numba would compile a custom version of this function for each - // combination of base and subtypes that were actually passed to the - // function. Second, because this subtype would now be treated as - // a totally separate type, it will cease to function in Numba unless - // an implementation of that type is provided to Numba through the - // Numba type extension mechanisms (e.g., overload). This would - // typically start with defining a Numba type corresponding to the - // ndarray subtype. This is the same concept as how Numba has a - // corollary of numpy.ndarray in its type system as types.Array. - // Next, one would typically defining boxing and unboxing routines - // and the associated memory model. Then, overloads for NumPy - // functions on that type would be created. However, - // if the same default array memory model is used then there are tricks - // one can do to look at Numba's internal types.Array registries and - // to quickly apply those to the subtype as well. In this manner, - // only those cases where the base ndarray and the ndarray subtype - // behavior differ would new custom functions need to be written for - // the subtype. 
Finally, - // after adding support for the new type, you would have a separate - // ndarray subtype that could operate with other objects of the same - // subtype but would not support interoperation with regular NumPy - // ndarrays. In standard Python, this interoperation is provided - // through the __array_ufunc__ magic method in the ndarray subtype - // class and in that case the function operates on ndarrays or their - // subtypes. This idea is extended into Numba such that - // __array_ufunc__ can be present in a Numba array type object. - // In this case, this function is consulted during Numba typing and - // so the arguments to __array_ufunc__ are Numba types instead of - // ndarray subtypes. The array type __array_ufunc__ returns the - // type of the output of the given ufunc. - // */ - // subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__"); - // if (!subtype_attr) { - // return typecode_ndarray(dispatcher, (PyArrayObject*)val); - // } - // } + /* Array scalar handling */ + else if (PyArray_CheckScalar(val)) { + return typecode_arrayscalar(dispatcher, val); + } + /* Array handling */ + else if (tyobj == &PyArray_Type) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + /* Subtype of CUDA device array */ + else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { + return typecode_devicendarray(dispatcher, val); + } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* By default, Numba will treat all numpy.ndarray subtypes as if they + were the base numpy.ndarray type. In this way, ndarray subtypes + can easily use all of the support that Numba has for ndarray + methods. + EXPERIMENTAL: There may be cases where a programmer would NOT want + ndarray subtypes to be treated exactly like the base numpy.ndarray. + For this purpose, a currently experimental feature allows a + programmer to add an attribute named + __numba_array_subtype_dispatch__ to their ndarray subtype. This + attribute can have any value as Numba only checks for the presence + of the attribute and not its value. When present, a ndarray subtype + will NOT be typed by Numba as a regular ndarray but this code will + fallthrough to the typecode_using_fingerprint call, which will + create a new unique Numba typecode for this ndarray subtype. This + behavior has several significant effects. First, since this + ndarray subtype will be treated as a different type by Numba, + the Numba dispatcher would then specialize on this type. So, if + there was a function that had several parameters that were + expected to be either numpy.ndarray or a subtype of ndarray, then + Numba would compile a custom version of this function for each + combination of base and subtypes that were actually passed to the + function. Second, because this subtype would now be treated as + a totally separate type, it will cease to function in Numba unless + an implementation of that type is provided to Numba through the + Numba type extension mechanisms (e.g., overload). This would + typically start with defining a Numba type corresponding to the + ndarray subtype. This is the same concept as how Numba has a + corollary of numpy.ndarray in its type system as types.Array. + Next, one would typically defining boxing and unboxing routines + and the associated memory model. Then, overloads for NumPy + functions on that type would be created. 
However,
+       if the same default array memory model is used then there are tricks
+       one can do to look at Numba's internal types.Array registries and
+       to quickly apply those to the subtype as well. In this manner,
+       new custom functions need to be written only for those cases where
+       the base ndarray and the ndarray subtype behavior differ. Finally,
+       after adding support for the new type, you would have a separate
+       ndarray subtype that could operate with other objects of the same
+       subtype but would not support interoperation with regular NumPy
+       ndarrays. In standard Python, this interoperation is provided
+       through the __array_ufunc__ magic method in the ndarray subtype
+       class and in that case the function operates on ndarrays or their
+       subtypes. This idea is extended into Numba such that
+       __array_ufunc__ can be present in a Numba array type object.
+       In this case, this function is consulted during Numba typing and
+       so the arguments to __array_ufunc__ are Numba types instead of
+       ndarray subtypes. The array type __array_ufunc__ returns the
+       type of the output of the given ufunc.
+       */
+        subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__");
+        if (!subtype_attr) {
+            return typecode_ndarray(dispatcher, (PyArrayObject*)val);
+        }
+    }

     return typecode_using_fingerprint(dispatcher, val);
 }
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index d02beffb0..d26304008 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -2136,3 +2136,16 @@ def _reduce_states(self):
         Compiled definitions are discarded.
         """
         return dict(py_func=self.py_func, targetoptions=self.targetoptions)
+
+
+if config.USE_LEGACY_TYPE_SYSTEM:  # Old type system
+    # Initialize typeof machinery
+    _dispatcher.typeof_init(
+        OmittedArg, dict((str(t), t._code) for t in types.number_domain)
+    )
+else:  # New type system
+    # Initialize typeof machinery
+    _dispatcher.typeof_init(
+        OmittedArg,
+        dict((str(t).split("_")[-1], t._code) for t in types.np_number_domain),
+    )

From edeed473aa2fd3d8dfa8fa3392cb6cec51654f19 Mon Sep 17 00:00:00 2001
From: Vijay Kandiah
Date: Mon, 4 Aug 2025 23:19:13 -0700
Subject: [PATCH 04/62] [Refactor][NFC] Trim CUDA _dispatcher cext

---
 .../numba/cuda/_dispatcher/_dispatcher.cpp    | 268 +----------------
 .../numba/cuda/_dispatcher/_numba_common.h    |  43 ---
 numba_cuda/numba/cuda/_dispatcher/_typeof.cpp |   3 +-
 3 files changed, 5 insertions(+), 309 deletions(-)
 delete mode 100644 numba_cuda/numba/cuda/_dispatcher/_numba_common.h

diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp
index 5afdb2880..cbe4b6d7d 100644
--- a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp
+++ b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp
@@ -388,8 +388,8 @@ else \
 typedef std::vector<Type> TypeTable;
 typedef std::vector<PyObject*> Functions;
 
-/* The Dispatcher class is the base class of all dispatchers in the CPU and
-   CUDA targets. Its main responsibilities are:
+/* The Dispatcher class is the base class of all dispatchers in the CUDA target.
+   Its main responsibilities are:
 
    - Resolving the best overload to call for a given set of arguments, and
    - Calling the resolved overload.
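   (Resolution delegates to TypeManager::selectOverload: the typecodes of the
   actual arguments are rated against each compiled signature and the best
   Rating wins; no match or an ambiguous match falls back to compiling a new
   specialization or raising a TypeError.)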
@@ -662,131 +662,9 @@ int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) } -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11)) -/* A custom, fast, inlinable version of PyCFunction_Call() */ -static PyObject * -call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) -{ - PyCFunctionWithKeywords fn; - PyThreadState *tstate; - - assert(PyCFunction_Check(cfunc)); - assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); - fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); - tstate = PyThreadState_GET(); - -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) - /* - * On Python 3.11, _PyEval_EvalFrameDefault stops using PyTraceInfo since - * it's now baked into ThreadState. - * https://github.com/python/cpython/pull/26623 - */ - if (tstate->cframe->use_tracing && tstate->c_profilefunc) -#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10) - /* - * On Python 3.10+ trace_info comes from somewhere up in PyFrameEval et al, - * Numba doesn't have access to that so creates an equivalent struct and - * wires it up against the cframes. This is passed into the tracing - * functions. - * - * Code originally from: - * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L1611-L1622 - */ - PyTraceInfo trace_info; - trace_info.code = NULL; // not initialized - CFrame *prev_cframe = tstate->cframe; - trace_info.cframe.use_tracing = prev_cframe->use_tracing; - trace_info.cframe.previous = prev_cframe; - - if (trace_info.cframe.use_tracing && tstate->c_profilefunc) -#else - /* - * On Python prior to 3.10, tracing state is a member of the threadstate - */ - if (tstate->use_tracing && tstate->c_profilefunc) -#endif - { - /* - * The following code requires some explaining: - * - * We want the jit-compiled function to be visible to the profiler, so we - * need to synthesize a frame for it. - * The PyFrame_New() constructor doesn't do anything with the 'locals' value if the 'code's - * 'CO_NEWLOCALS' flag is set (which is always the case nowadays). - * So, to get local variables into the frame, we have to manually set the 'f_locals' - * member, then call `PyFrame_LocalsToFast`, where a subsequent call to the `frame.f_locals` - * property (by virtue of the `frame_getlocals` function in frameobject.c) will find them. 
- */ - PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); - PyObject *globals = PyDict_New(); - PyObject *builtins = PyEval_GetBuiltins(); - PyFrameObject *frame = NULL; - PyObject *result = NULL; -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10)) - // Only used in 3.10, to help with saving/restoring exception state - PyObject *pyexc = NULL; - PyObject *err_type = NULL; - PyObject *err_value = NULL; - PyObject *err_traceback = NULL; -#endif - if (!code) { - PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); - goto error; - } - /* Populate builtins, which is required by some JITted functions */ - if (PyDict_SetItemString(globals, "__builtins__", builtins)) { - goto error; - } - - /* unset the CO_OPTIMIZED flag, make the frame get a new locals dict */ - code->co_flags &= 0xFFFE; - - frame = PyFrame_New(tstate, code, globals, locals); - if (frame == NULL) { - goto error; - } -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) - // Python 3.11 improved the frame infrastructure such that frames are - // updated by the virtual machine, no need to do PyFrame_LocalsToFast - // and PyFrame_FastToLocals to ensure `frame->f_locals` is consistent. - C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws), frame); -#else - // Populate the 'fast locals' in `frame` - PyFrame_LocalsToFast(frame, 0); - tstate->frame = frame; - - // make the call - C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws)); - - // write changes back to locals? - // PyFrame_FastToLocals can clear the exception indicator, therefore - // this state needs saving and restoring across the call if the - // exception indicator is set. - pyexc = PyErr_Occurred(); - if (pyexc != NULL) { - PyErr_Fetch(&err_type, &err_value, &err_traceback); - } - PyFrame_FastToLocals(frame); - if (pyexc != NULL) { - PyErr_Restore(err_type, err_value, err_traceback); - } - tstate->frame = frame->f_back; -#endif - error: - Py_XDECREF(frame); - Py_XDECREF(globals); - Py_XDECREF(code); - return result; - } - else - { - return fn(PyCFunction_GET_SELF(cfunc), args, kws); - } -} - -#elif (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) // Python 3.12 has a completely new approach to tracing and profiling due to // the new `sys.monitoring` system. @@ -1124,35 +1002,7 @@ call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyO #endif -static -PyObject* -compile_and_invoke(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) -{ - /* Compile a new one */ - PyObject *cfa, *cfunc, *retval; - cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); - if (cfa == NULL) - return NULL; - - /* NOTE: we call the compiled function ourselves instead of - letting the Python derived class do it. This is for proper - behaviour of globals() in jitted functions (issue #476). */ - cfunc = PyObject_Call(cfa, args, kws); - Py_DECREF(cfa); - - if (cfunc == NULL) - return NULL; - - if (PyObject_TypeCheck(cfunc, &PyCFunction_Type)) { - retval = call_cfunc(self, cfunc, args, kws, locals); - } else { - /* Re-enter interpreter */ - retval = PyObject_Call(cfunc, args, kws); - } - Py_DECREF(cfunc); - return retval; -} /* A copy of compile_and_invoke, that only compiles. 
This is needed for CUDA * kernels, because its overloads are Python instances of the _Kernel class, @@ -1288,117 +1138,7 @@ find_named_args(Dispatcher *self, PyObject **pargs, PyObject **pkws) return 0; } -static PyObject* -Dispatcher_call(Dispatcher *self, PyObject *args, PyObject *kws) -{ - PyObject *tmptype, *retval = NULL; - int *tys = NULL; - int argct; - int i; - int prealloc[24]; - int matches; - PyObject *cfunc; - PyThreadState *ts = PyThreadState_Get(); - PyObject *locals = NULL; - - /* If compilation is enabled, ensure that an exact match is found and if - * not compile one */ - int exact_match_required = self->can_compile ? 1 : self->exact_match_required; - -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) - if (ts->tracing && ts->c_profilefunc) { -#else - if (ts->use_tracing && ts->c_profilefunc) { -#endif - locals = PyEval_GetLocals(); - if (locals == NULL) { - goto CLEANUP; - } - } - if (self->fold_args) { - if (find_named_args(self, &args, &kws)) - return NULL; - } - else - Py_INCREF(args); - /* Now we own a reference to args */ - - argct = PySequence_Fast_GET_SIZE(args); - - if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) - tys = prealloc; - else - tys = new int[argct]; - - for (i = 0; i < argct; ++i) { - tmptype = PySequence_Fast_GET_ITEM(args, i); - tys[i] = typeof_typecode((PyObject *) self, tmptype); - if (tys[i] == -1) { - if (self->can_fallback){ - /* We will clear the exception if fallback is allowed. */ - PyErr_Clear(); - } else { - goto CLEANUP; - } - } - } - - /* We only allow unsafe conversions if compilation of new specializations - has been disabled. - - Note that the number of matches is returned in matches by resolve, which - accepts it as a reference. */ - cfunc = self->resolve(tys, matches, !self->can_compile, - exact_match_required); - - if (matches == 0 && !self->can_compile) { - /* - * If we can't compile a new specialization, look for - * matching signatures for which conversions haven't been - * registered on the C++ TypeManager. - */ - int res = search_new_conversions((PyObject *) self, args, kws); - if (res < 0) { - retval = NULL; - goto CLEANUP; - } - if (res > 0) { - /* Retry with the newly registered conversions */ - cfunc = self->resolve(tys, matches, !self->can_compile, - exact_match_required); - } - } - if (matches == 1) { - /* Definition is found */ - retval = call_cfunc(self, cfunc, args, kws, locals); - } else if (matches == 0) { - /* No matching definition */ - if (self->can_compile) { - retval = compile_and_invoke(self, args, kws, locals); - } else if (self->fallbackdef) { - /* Have object fallback */ - retval = call_cfunc(self, self->fallbackdef, args, kws, locals); - } else { - /* Raise TypeError */ - explain_matching_error((PyObject *) self, args, kws); - retval = NULL; - } - } else if (self->can_compile) { - /* Ambiguous, but are allowed to compile */ - retval = compile_and_invoke(self, args, kws, locals); - } else { - /* Ambiguous */ - explain_ambiguous((PyObject *) self, args, kws); - retval = NULL; - } - -CLEANUP: - if (tys != prealloc) - delete[] tys; - Py_DECREF(args); - return retval; -} /* Based on Dispatcher_call above, with the following differences: 1. It does not invoke the definition of the function. 
@@ -1565,7 +1305,7 @@ static PyTypeObject DispatcherType = { 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ - (PyCFunctionWithKeywords)Dispatcher_call, /* tp_call*/ + 0, /* tp_call*/ 0, /* tp_str*/ 0, /* tp_getattro*/ 0, /* tp_setattro*/ diff --git a/numba_cuda/numba/cuda/_dispatcher/_numba_common.h b/numba_cuda/numba/cuda/_dispatcher/_numba_common.h deleted file mode 100644 index d458e4240..000000000 --- a/numba_cuda/numba/cuda/_dispatcher/_numba_common.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef NUMBA_COMMON_H_ -#define NUMBA_COMMON_H_ - -/* __has_attribute() is a clang / gcc-5 macro */ -#ifndef __has_attribute -# define __has_attribute(x) 0 -#endif - -/* This attribute marks symbols that can be shared across C objects - * but are not exposed outside of a shared library or executable. - * Note this is default behaviour for global symbols under Windows. - */ -#if defined(_MSC_VER) - #define VISIBILITY_HIDDEN - #define VISIBILITY_GLOBAL __declspec(dllexport) -#elif (__has_attribute(visibility) || (defined(__GNUC__) && __GNUC__ >= 4)) - #define VISIBILITY_HIDDEN __attribute__ ((visibility("hidden"))) - #define VISIBILITY_GLOBAL __attribute__ ((visibility("default"))) -#else - #define VISIBILITY_HIDDEN - #define VISIBILITY_GLOBAL -#endif - -/* - * Numba's version of the PyArray_DescrCheck macro from NumPy, use it as a - * direct replacement of NumPy's PyArray_DescrCheck to ensure binary - * compatibility. - * - * Details of why this is needed: - * NumPy 1.18 changed the definition of the PyArray_DescrCheck macro here: - * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698 - * the result of this being that building against NumPy <1.18 would prevent - * Numba running against NumPy >= 1.20 as noted here: - * https://github.com/numba/numba/issues/6041#issuecomment-665132199 - * - * This macro definition is copied from: - * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698#diff-ad2213da23136c5fc5883d9eb2d88666R26 - * - * NOTE: This is the NumPy 1.18 and above version of the macro. 
- */ -#define NUMBA_PyArray_DescrCheck(op) PyObject_TypeCheck(op, &PyArrayDescr_Type) - -#endif /* NUMBA_COMMON_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp index 16c5009c4..4f03d0070 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp @@ -4,7 +4,6 @@ #include #include -#include "_numba_common.h" #include "_typeof.h" #include "_hashtable.h" #include "_devicearray.h" @@ -460,7 +459,7 @@ compute_fingerprint(string_writer_t *w, PyObject *val) PyBuffer_Release(&buf); return 0; } - if (NUMBA_PyArray_DescrCheck(val)) { + if (PyObject_TypeCheck(val, &PyArrayDescr_Type)) { TRY(string_writer_put_char, w, OP_NP_DTYPE); return compute_dtype_fingerprint(w, (PyArray_Descr *) val); } From df14badf16804c01c954f92a7ba127db41abd3f4 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 5 Aug 2025 10:35:29 -0700 Subject: [PATCH 05/62] [Refactor][NFC] Remove sysmon from _dispatcher cext --- numba_cuda/numba/cuda/__init__.py | 1 - .../numba/cuda/_dispatcher/_dispatcher.cpp | 337 +----------------- numba_cuda/numba/cuda/dispatcher.py | 1 - pyproject.toml | 1 - 4 files changed, 3 insertions(+), 337 deletions(-) diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 607ef2e09..0465e1903 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -1,6 +1,5 @@ import importlib from numba.core import config -from numba.core.config import ENABLE_SYS_MONITORING from .utils import _readenv import warnings diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp index cbe4b6d7d..46cb7888a 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp @@ -403,8 +403,6 @@ class Dispatcher { PyObject_HEAD /* Whether compilation of new overloads is permitted */ char can_compile; - /* Enable sys.monitoring (since Python 3.12+) */ - char enable_sysmon; /* Whether fallback to object mode is permitted */ char can_fallback; /* Whether types must match exactly when resolving overloads. @@ -534,7 +532,6 @@ Dispatcher_init(Dispatcher *self, PyObject *args, PyObject *kwds) self->tm = static_cast(tmaddr); self->argct = argct; self->can_compile = 1; - self->enable_sysmon = 0; // default to turn off sys.monitoring self->can_fallback = can_fallback; self->fallbackdef = NULL; self->has_stararg = has_stararg; @@ -662,344 +659,22 @@ int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) } - - -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) - -// Python 3.12 has a completely new approach to tracing and profiling due to -// the new `sys.monitoring` system. - -// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L863-L868 - -static const int8_t MOST_SIG_BIT[16] = {-1, 0, 1, 1, - 2, 2, 2, 2, - 3, 3, 3, 3, - 3, 3, 3, 3}; - -// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L873-L879 - -static inline int msb(uint8_t bits) { - if (bits > 15) { - return MOST_SIG_BIT[bits>>4]+4; - } - return MOST_SIG_BIT[bits]; -} - - -static int invoke_monitoring(PyThreadState * tstate, int event, Dispatcher *self, PyObject* retval) -{ - // This will invoke monitoring tools (if present) for the event `event`. 
- // - // Arguments: - // tstate - the interpreter thread state - // event - an event as defined in internal/pycore_instruments.h - // self - the dispatcher - // retval - the return value from running the dispatcher machine code (if needed) - // or NULL if not needed. - // - // Return: - // status 0 for success -1 otherwise. - // - // Notes: - // Python 3.12 has a new monitoring system as described in PEP 669. It's - // largely implemented in CPython PR #103083. - // - // This PEP manifests as a set of monitoring instrumentation in the form of - // per-monitoring-tool-type callbacks stored as part of the interpreter - // state (can also be on the code object for "local events" but Numba - // doesn't support those, see the Numba developer docs). From the Python - // interpreter this appears as `sys.monitoring`, from the C-side there's not - // a great deal of public API for the sort of things that Numba wants/needs - // to do. - // - // The new monitoring system is event based, the general idea in the - // following code is to see if a monitoring "tool" has registered a callback - // to run on the presence of a particular event and run those callbacks if - // so. In Numba's case we're just about to disappear into machine code - // that's essentially doing the same thing as the interpreter would if it - // executed the bytecode present in the function that's been JIT compiled. - // As a result we need to tell any tool that has a callback registered for a - // PY_MONITORING_EVENT_PY_START that a Python function is about to start - // (and do something similar for when a function returns/raises). - // This is a total lie as the execution is in machine code, but telling this - // lie makes it look like a python function has started executing at the - // point the machine code function starts and tools like profilers will be - // able to identify this and do something appropriate. The "lie" is very - // much like lie told for Python < 3.12, but the format of the lie is - // different. There is no fake frame involved, it's just about calling an - // appropriate call back, which in a way is a lot less confusing to deal - // with. - // - // For reference, under cProfile all these are NULL, don't even look at - // them, they are legacy, you need to use the monitoring system! - // tstate->c_profilefunc - // tstate->c_profileobj - // tstate->c_tracefunc - // tstate->c_traceobj - // - // Finally: Useful places to look in the CPython code base: - // 1. internal/pycore_instruments.h which has the #defines for all the event - // types and the "types" of tools e.g. debugger, profiler. - // 2. Python/instrumentation.c which is where most of the implementation is - // done. Particularly functions `call_instrumentation_vector` and - // `call_one_instrument`. - // Note that Python/legacy_tracing.c is not somewhere to look, it's just - // wiring old style tracing that has been setup via e.g. C-API - // PyEval_SetProfile into the new monitoring system. - // - // Other things... - // 1. Calls to `sys.monitoring.set_events` clobber the previous state. - // 2. You can register callbacks for an event without having the event set. - // 3. You can set events and have no associated callback. - // 4. Tools are supposed to be respectful of other tools that are - // registered, i.e. not clobber/interfere with each other. - // 5. There are multiple slots for tools, cProfile is a profiler and - // profilers should register in slot 2 by convention. 
- // - // This is useful for debug: - // To detect whether Python is doing _any_ monitoring it's necessary to - // inspect the per-thread state interpreter monitors.tools member, its a - // uchar[15]. A non-zero value in any tools slot suggests something - // is registered to be called on the occurence of some event. - // - // bool monitoring_tools_present = false; - // for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { - // if (tstate->interp->monitors.tools[i]) { - // monitoring_tools_present = true; - // break; - // } - // } - - // The code in this function is based loosely on a combination of the - // following: - // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L945-L1008 - // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L1010-L1026 - // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L839-L861 - - // TODO: check this, call_instrumentation_vector has this at the top. - if (tstate->tracing){ - return 0; - } - - // Are there any tools set on this thead for this event? - uint8_t tools = tstate->interp->monitors.tools[event]; - // offset value for use in callbacks - PyObject * offset_obj = NULL; - // callback args slots (used in vectorcall protocol) - PyObject * callback_args[3] = {NULL, NULL, NULL}; - - // If so... - if (tools) - { - - - PyObject *result = NULL; - PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); // incref code - if (!code) { - PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); - return -1; - } - - // TODO: handle local events, see `get_tools_for_instruction`. - // The issue with local events is that they maybe don't make a lot of - // sense in a JIT context. The way it works is that - // `sys.monitoring.set_local_events` takes the code object of a function - // and "instruments" it with respect to the requested events. In - // practice this seems to materialise as swapping bytecodes associated - // with the event bitmask for `INSTRUMENTED_` variants of those - // bytecodes. Then at interpretation time if an instrumented instruction - // is encountered it triggers lookups in the `code->_co_monitoring` - // struct for tools and active monitors etc. In Numba we _know_ the - // bytecode at which the code starts and we can probably scrape the code - // to look for instrumented return instructions, so it is feasible to - // support at least PY_START and PY_RETURN events, however, it's a lot - // of effort for perhaps something that's practically not that useful. - // As a result, only global events are supported at present. - - // This is supposed to be the offset of the - // currently-being-interpreted bytecode instruction. In Numba's case - // there is no bytecode executing. We know that for a PY_START event - // that the offset is probably zero (it might be 2 if there's a - // closure, it's whereever the `RESUME` bytecode appears). However, - // we don't know which bytecode will be associated with the return - // (without huge effort to wire that through to here). Therefore - // zero is also used for return/raise/unwind, the main use case, - // cProfile, seems to manage to do something sensible even though this - // is inaccurate. - offset_obj = PyLong_FromSsize_t(0); // incref offset_obj - - // This is adapted from call_one_instrument. 
Note that Numba has to care - // about all events even though it only emits fake events for PY_START, - // PY_RETURN, RAISE and PY_UNWIND, this is because of the ability of - // `objmode` to call back into the interpreter and essentially create a - // continued Python execution environment/stack from there. - while(tools) { - // The tools registered are set as bits in `tools` and provide an - // index into monitoring_callables. This is presumably used by - // cPython to detect if the slot of a tool type is already in use so - // that a user can't register more than one tool of a given type at - // the same time. - int tool = msb(tools); - tools ^= (1 << tool); - // Get the instrument at offset `tool` for the event of interest, - // this is a callback function, it also might not be present! It - // is entirely legitimate to have events that have no callback - // and callbacks that have no event. This is to make it relatively - // easy to switch events on and off and ensure that monitoring is - // "lightweight". - PyObject * instrument = (PyObject *)tstate->interp->monitoring_callables[tool][event]; - if (instrument == NULL){ - continue; - } - - // Swap the threadstate "event" for the event of interest and - // increment the tracing tracking field (essentially, inlined - // PyThreadState_EnterTracing). - int old_what = tstate->what_event; - tstate->what_event = event; - tstate->tracing++; - - // Need to call the callback instrument. Need to know the number of - // arguments, this is based on whether the `retval` (return value) - // is NULL (it indicates whether this is a PY_START, or something - // like a PY_RETURN, which has 3 arguments). - size_t nargsf = (retval == NULL ? 2 : 3) | PY_VECTORCALL_ARGUMENTS_OFFSET; - - // call the instrumentation, look at the args to the callback - // functions for sys.monitoring events to find out what the - // arguments are. e.g. - // PY_START has `func(code: CodeType, instruction_offset: int)` - // whereas - // PY_RETURN has `func(code: CodeType, instruction_offset: int, retval: object)` - // and - // CALL, C_RAISE, C_RETURN has `func(code: CodeType, instruction_offset: int, callable: object, arg0 object|MISSING)` - // i.e. the signature changes based on context. This influences the - // value of `nargsf` and what is wired into `callback_args`. First two - // arguments are always code and offset, optional third arg is - // the return value. - callback_args[0] = (PyObject*)code; - callback_args[1] = (PyObject*)offset_obj; - callback_args[2] = (PyObject*)retval; - PyObject ** callargs = &callback_args[0]; - - // finally, stage the call the the instrument - result = PyObject_Vectorcall(instrument, callargs, nargsf, NULL); - - // decrement the tracing tracking field and set the event back to - // the original event (essentially, inlined - // PyThreadState_LeaveTracing). - tstate->tracing--; - tstate->what_event = old_what; - - if (result == NULL){ - // Error occurred in call to instrumentation. 
- Py_XDECREF(offset_obj); - Py_XDECREF(code); - return -1; - } - } - Py_XDECREF(offset_obj); - Py_XDECREF(code); - } - return 0; -} - -/* invoke monitoring for PY_START if it is set */ -int static inline invoke_monitoring_PY_START(PyThreadState * tstate, Dispatcher *self) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_START, self, NULL); -} - -/* invoke monitoring for PY_RETURN if it is set */ -int static inline invoke_monitoring_PY_RETURN(PyThreadState * tstate, Dispatcher *self, PyObject * retval) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_RETURN, self, retval); -} - -/* invoke monitoring for RAISE if it is set */ -int static inline invoke_monitoring_RAISE(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_RAISE, self, exception); -} - -/* invoke monitoring for PY_UNWIND if it is set */ -int static inline invoke_monitoring_PY_UNWIND(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_UNWIND, self, exception); -} - -/* forward declaration */ -bool static is_sysmon_enabled(Dispatcher *self); - /* A custom, fast, inlinable version of PyCFunction_Call() */ static PyObject * call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) { PyCFunctionWithKeywords fn = NULL; - PyThreadState *tstate = NULL; PyObject * pyresult = NULL; - PyObject * pyexception = NULL; - const bool enabled_sysmon = is_sysmon_enabled(self); assert(PyCFunction_Check(cfunc)); assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); - tstate = PyThreadState_GET(); - // issue PY_START if event is set - if(enabled_sysmon && invoke_monitoring_PY_START(tstate, self) != 0){ - return NULL; - } + // make call pyresult = fn(PyCFunction_GET_SELF(cfunc), args, kws); - if (enabled_sysmon && pyresult == NULL) { - // pyresult == NULL, which means the Numba function raised an exception - // which is now pending. - // - // NOTE: that _ALL_ exceptions trigger the RAISE event, even a - // StopIteration exception. To get a STOP_ITERATION event, the - // StopIteration exception must be "implied" i.e. a for loop exhausting - // a generator, whereas those coming from the executing the binary - // wrapped in this dispatcher must always be explicit (this is after all - // a function dispatcher). - // - // NOTE: That it is necessary to trigger both a `RAISE` event, as this - // triggered by an exception being raised, and a `PY_UNWIND` event, as - // this is the event for "exiting from a python function during - // exception unwinding" (see CPython sys.monitoring docs). - // - // In the following, if the call to PyErr_GetRaisedException returns - // NULL, it means that something has cleared the error indicator and - // this is a most surprising state to occur (shouldn't be possible!). - // - // TODO: This makes the exception raising path a little slower as the - // exception state is suspended and resumed regardless of whether - // monitoring for such an event is set. In future it might be worth - // checking the tstate->interp->monitors.tools[event] and only doing the - // suspend/resume if something is listening for the event. - pyexception = PyErr_GetRaisedException(); - if (pyexception != NULL) { - if(invoke_monitoring_RAISE(tstate, self, pyexception) != 0){ - // If the monitoring callback raised, return NULL so that the - // exception can propagate. 
- return NULL; - } - if(invoke_monitoring_PY_UNWIND(tstate, self, pyexception) != 0){ - // If the monitoring callback raised, return NULL so that the - // exception can propagate. - return NULL; - } - // reset the exception - PyErr_SetRaisedException(pyexception); - } - // Exception in Numba call as pyresult == NULL, start to unwind by - // returning NULL. - return NULL; - } - // issue PY_RETURN if event is set - if(enabled_sysmon && invoke_monitoring_PY_RETURN(tstate, self, pyresult) != 0){ - return NULL; - } + return pyresult; } -#else -#error "Python version is not supported." -#endif @@ -1285,7 +960,6 @@ static PyMethodDef Dispatcher_methods[] = { static PyMemberDef Dispatcher_members[] = { {(char*)"_can_compile", T_BOOL, offsetof(Dispatcher, can_compile), 0, NULL }, - {(char*)"_enable_sysmon", T_BOOL, offsetof(Dispatcher, enable_sysmon), 0, NULL }, {NULL} /* Sentinel */ }; @@ -1360,12 +1034,7 @@ static PyTypeObject DispatcherType = { }; -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) -static -bool is_sysmon_enabled(Dispatcher * self) { - return self->enable_sysmon; -} -#endif + static PyObject *compute_fingerprint(PyObject *self, PyObject *args) { diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index d26304008..4afcbbde7 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -820,7 +820,6 @@ def __init__( self.doc = py_func.__doc__ self._compiling_counter = CompilingCounter() - self._enable_sysmon = bool(config.ENABLE_SYS_MONITORING) weakref.finalize(self, self._make_finalizer()) def _compilation_chain_init_hook(self): diff --git a/pyproject.toml b/pyproject.toml index e0c2da629..3fd0f65fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,6 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", - "numpy", ] [project] From 0f7c9a7d2f263f44bd1c9ac95611e26872843f65 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 6 Aug 2025 09:31:31 -0700 Subject: [PATCH 06/62] [Refactor][NFC] Changes to build _dispatcher cext with pip install -e . 
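
Editable installs ("pip install -e .") drive setuptools through the
editable_wheel command (PEP 660), which the extension-build check in
setup.py did not previously treat as a build command, so the cext was
skipped. A minimal sketch of the kind of check involved (illustrative
only; the helper name and command list here are assumptions, not the
exact setup.py contents):

    import sys

    def is_building_sketch(argv=None):
        # Sketch only: build native extensions when a build-like command is
        # requested; "editable_wheel" is what "pip install -e ." invokes
        # under PEP 660, so it must be recognised alongside the others.
        argv = sys.argv[1:] if argv is None else argv
        build_commands = ["build", "build_ext", "bdist_wheel",
                          "editable_wheel", "develop", "install"]
        return any(cmd in argv for cmd in build_commands)
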
--- numba_cuda/numba/cuda/dispatcher.py | 15 ++++----------- pyproject.toml | 1 + setup.py | 1 + 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 4afcbbde7..a88093e31 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -2137,14 +2137,7 @@ def _reduce_states(self): return dict(py_func=self.py_func, targetoptions=self.targetoptions) -if config.USE_LEGACY_TYPE_SYSTEM: # Old type system - # Initialize typeof machinery - _dispatcher.typeof_init( - OmittedArg, dict((str(t), t._code) for t in types.number_domain) - ) -else: # New type system - # Initialize typeof machinery - _dispatcher.typeof_init( - OmittedArg, - dict((str(t).split("_")[-1], t._code) for t in types.np_number_domain), - ) +# Initialize typeof machinery +_dispatcher.typeof_init( + OmittedArg, dict((str(t), t._code) for t in types.number_domain) +) diff --git a/pyproject.toml b/pyproject.toml index 3fd0f65fc..e0c2da629 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", + "numpy", ] [project] diff --git a/setup.py b/setup.py index 31f66ae0e..c4a4b671b 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,7 @@ def is_building(): "develop", "easy_install", "test", + "editable_wheel", ] return any(bc in sys.argv[1:] for bc in build_commands) From 21fd5068fe18ed08ad9ff0fdedf1c9aa2e95c667 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 6 Aug 2025 10:45:21 -0700 Subject: [PATCH 07/62] Add numpy build dep to conda/recipes/numba-cuda --- conda/recipes/numba-cuda/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 5aecbfcb7..cba6deb08 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,6 +25,7 @@ requirements: - python - pip - setuptools + - numpy run: - python - numba >=0.59.1 From 3bcdaad669d86f4b9b31255de1ae8002070edc51 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Fri, 8 Aug 2025 11:41:02 -0700 Subject: [PATCH 08/62] [Refactor][NFC] Add _devicearray, mviewbuf cext, move all cexts to top level numba_cuda --- numba_cuda/cext/_devicearray.cpp | 156 +++++++ .../cuda/_dispatcher => cext}/_devicearray.h | 0 .../cuda/_dispatcher => cext}/_dispatcher.cpp | 7 +- .../cuda/_dispatcher => cext}/_hashtable.cpp | 0 .../cuda/_dispatcher => cext}/_hashtable.h | 0 .../cuda/_dispatcher => cext}/_pymodule.h | 0 .../cuda/_dispatcher => cext}/_typeof.cpp | 0 .../cuda/_dispatcher => cext}/_typeof.h | 0 numba_cuda/cext/mviewbuf.c | 382 ++++++++++++++++++ .../cuda/_dispatcher => cext}/typeconv.cpp | 0 .../cuda/_dispatcher => cext}/typeconv.hpp | 0 numba_cuda/numba/cuda/cudadrv/devicearray.py | 2 +- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/dispatcher.py | 2 +- setup.py | 43 +- 15 files changed, 577 insertions(+), 17 deletions(-) create mode 100644 numba_cuda/cext/_devicearray.cpp rename numba_cuda/{numba/cuda/_dispatcher => cext}/_devicearray.h (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_dispatcher.cpp (99%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_hashtable.cpp (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_hashtable.h (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_pymodule.h (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_typeof.cpp (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_typeof.h (100%) create 
mode 100644 numba_cuda/cext/mviewbuf.c rename numba_cuda/{numba/cuda/_dispatcher => cext}/typeconv.cpp (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/typeconv.hpp (100%) diff --git a/numba_cuda/cext/_devicearray.cpp b/numba_cuda/cext/_devicearray.cpp new file mode 100644 index 000000000..a822c6261 --- /dev/null +++ b/numba_cuda/cext/_devicearray.cpp @@ -0,0 +1,156 @@ +/* This file contains the base class implementation for all device arrays. The + * base class is implemented in C so that computing typecodes for device arrays + * can be implemented efficiently. */ + +#include "_pymodule.h" + + +/* Include _devicearray., but make sure we don't get the definitions intended + * for consumers of the Device Array API. + */ +#define NUMBA_IN_DEVICEARRAY_CPP_ +#include "_devicearray.h" + +/* DeviceArray PyObject implementation. Note that adding more members here is + * presently prohibited because mapped and managed arrays derive from both + * DeviceArray and NumPy's ndarray, which is also a C extension class - the + * layout of the object cannot be resolved if this class also has members beyond + * PyObject_HEAD. */ +class DeviceArray { + PyObject_HEAD +}; + +/* Trivial traversal - DeviceArray instances own nothing. */ +static int +DeviceArray_traverse(DeviceArray *self, visitproc visit, void *arg) +{ + return 0; +} + +/* Trivial clear of all references - DeviceArray instances own nothing. */ +static int +DeviceArray_clear(DeviceArray *self) +{ + return 0; +} + +/* The _devicearray.DeviceArray type */ +PyTypeObject DeviceArrayType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_devicearray.DeviceArray", /* tp_name */ + sizeof(DeviceArray), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, + /* tp_flags*/ + "DeviceArray object", /* tp_doc */ + (traverseproc) DeviceArray_traverse, /* tp_traverse */ + (inquiry) DeviceArray_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." 
+#endif +/* END WARNING*/ +}; + +/* CUDA device array C API */ +static void *_DeviceArray_API[1] = { + (void*)&DeviceArrayType +}; + +MOD_INIT(_devicearray) { + PyObject *m = nullptr; + PyObject *d = nullptr; + PyObject *c_api = nullptr; + int error = 0; + + MOD_DEF(m, "_devicearray", "No docs", NULL) + if (m == NULL) + goto error_occurred; + + c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda.cext._devicearray._DEVICEARRAY_API", NULL); + if (c_api == NULL) + goto error_occurred; + + DeviceArrayType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DeviceArrayType) < 0) + goto error_occurred; + + Py_INCREF(&DeviceArrayType); + error = PyModule_AddObject(m, "DeviceArray", (PyObject*)(&DeviceArrayType)); + if (error) + goto error_occurred; + + d = PyModule_GetDict(m); + if (d == NULL) + goto error_occurred; + + error = PyDict_SetItemString(d, "_DEVICEARRAY_API", c_api); + /* Decref and set c_api to NULL, Py_XDECREF in error_occurred will have no + * effect. */ + Py_CLEAR(c_api); + + if (error) + goto error_occurred; + + return MOD_SUCCESS_VAL(m); + +error_occurred: + Py_XDECREF(m); + Py_XDECREF(c_api); + Py_XDECREF((PyObject*)&DeviceArrayType); + + return MOD_ERROR_VAL; +} diff --git a/numba_cuda/numba/cuda/_dispatcher/_devicearray.h b/numba_cuda/cext/_devicearray.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_devicearray.h rename to numba_cuda/cext/_devicearray.h diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/cext/_dispatcher.cpp similarity index 99% rename from numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp rename to numba_cuda/cext/_dispatcher.cpp index 46cb7888a..b2f3e1c54 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp +++ b/numba_cuda/cext/_dispatcher.cpp @@ -935,13 +935,14 @@ Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) static int import_devicearray(void) { - PyObject *devicearray = PyImport_ImportModule("numba._devicearray"); + // Import our vendored devicearray (following original numba's pattern) + PyObject *devicearray = PyImport_ImportModule("numba_cuda.cext._devicearray"); if (devicearray == NULL) { return -1; } Py_DECREF(devicearray); - DeviceArray_API = (void**)PyCapsule_Import("numba._devicearray._DEVICEARRAY_API", 0); + DeviceArray_API = (void**)PyCapsule_Import("numba_cuda.cext._devicearray._DEVICEARRAY_API", 0); if (DeviceArray_API == NULL) { return -1; } @@ -1056,7 +1057,7 @@ static PyMethodDef ext_methods[] = { MOD_INIT(_dispatcher) { if (import_devicearray() < 0) { PyErr_Print(); - PyErr_SetString(PyExc_ImportError, "numba._devicearray failed to import"); + PyErr_SetString(PyExc_ImportError, "numba_cuda.cext._devicearray failed to import"); return MOD_ERROR_VAL; } diff --git a/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp b/numba_cuda/cext/_hashtable.cpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp rename to numba_cuda/cext/_hashtable.cpp diff --git a/numba_cuda/numba/cuda/_dispatcher/_hashtable.h b/numba_cuda/cext/_hashtable.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_hashtable.h rename to numba_cuda/cext/_hashtable.h diff --git a/numba_cuda/numba/cuda/_dispatcher/_pymodule.h b/numba_cuda/cext/_pymodule.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_pymodule.h rename to numba_cuda/cext/_pymodule.h diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/cext/_typeof.cpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_typeof.cpp rename to 
numba_cuda/cext/_typeof.cpp diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.h b/numba_cuda/cext/_typeof.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_typeof.h rename to numba_cuda/cext/_typeof.h diff --git a/numba_cuda/cext/mviewbuf.c b/numba_cuda/cext/mviewbuf.c new file mode 100644 index 000000000..33f0888c7 --- /dev/null +++ b/numba_cuda/cext/mviewbuf.c @@ -0,0 +1,382 @@ +#include "_pymodule.h" + +static int get_writable_buffer(PyObject* obj, Py_buffer *buf, int force) +{ + Py_buffer read_buf; + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + int ret; + + /* Attempt to get a writable buffer */ + if (!PyObject_GetBuffer(obj, buf, flags|PyBUF_WRITABLE)) + return 0; + if (!force) + return -1; + + /* Make a writable buffer from a read-only buffer */ + PyErr_Clear(); + if(-1 == PyObject_GetBuffer(obj, &read_buf, flags)) + return -1; + ret = PyBuffer_FillInfo(buf, NULL, read_buf.buf, read_buf.len, 0, + flags|PyBUF_WRITABLE); + PyBuffer_Release(&read_buf); + return ret; +} + +static int get_readonly_buffer(PyObject* obj, Py_buffer *buf) +{ + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + + return PyObject_GetBuffer(obj, buf, flags); +} + + +static void free_buffer(Py_buffer * buf) +{ + PyBuffer_Release(buf); +} + +/** + * Return a pointer to the data of a writable buffer from obj. If only a + * read-only buffer is available and force is True, a read-write buffer based on + * the read-only buffer is obtained. Note that this may have some surprising + * effects on buffers which expect the data from their read-only buffer not to + * be modified. + */ +static PyObject* +memoryview_get_buffer(PyObject *self, PyObject *args){ + PyObject *obj = NULL; + int force = 0; + int readonly = 0; + PyObject *ret = NULL; + Py_buffer buf; + + if (!PyArg_ParseTuple(args, "O|ii", &obj, &force, &readonly)) + return NULL; + + if (readonly) { + if (get_readonly_buffer(obj, &buf)) + return NULL; + } else { + if (get_writable_buffer(obj, &buf, force)) + return NULL; + } + + ret = PyLong_FromVoidPtr(buf.buf); + free_buffer(&buf); + return ret; +} + +/** + * Gets a half-open range [start, end) which contains the array data + * Modified from numpy/core/src/multiarray/array_assign.c + */ +static PyObject* +get_extents(Py_ssize_t *shape, Py_ssize_t *strides, int ndim, + Py_ssize_t itemsize, Py_ssize_t ptr) +{ + Py_ssize_t start, end; + int idim; + Py_ssize_t *dimensions = shape; + PyObject *ret = NULL; + + if (ndim < 0 ){ + PyErr_SetString(PyExc_ValueError, "buffer ndim < 0"); + return NULL; + } + + if (!dimensions) { + if (ndim == 0) { + start = end = ptr; + end += itemsize; + return Py_BuildValue("nn", start, end); + } + PyErr_SetString(PyExc_ValueError, "buffer shape is not defined"); + return NULL; + } + + if (!strides) { + PyErr_SetString(PyExc_ValueError, "buffer strides is not defined"); + return NULL; + } + + /* Calculate with a closed range [start, end] */ + start = end = ptr; + for (idim = 0; idim < ndim; ++idim) { + Py_ssize_t stride = strides[idim], dim = dimensions[idim]; + /* If the array size is zero, return an empty range */ + if (dim == 0) { + start = end = ptr; + ret = Py_BuildValue("nn", start, end); + break; + } + /* Expand either upwards or downwards depending on stride */ + else { + if (stride > 0) { + end += stride * (dim - 1); + } + else if (stride < 0) { + start += stride * (dim - 1); + } + } + } + + if (!ret) { + /* Return a half-open range */ + Py_ssize_t out_start = start; + Py_ssize_t out_end = end + itemsize; + + ret = Py_BuildValue("nn", out_start, 
out_end); + } + + return ret; +} + +static PyObject* +memoryview_get_extents(PyObject *self, PyObject *args) +{ + PyObject *obj = NULL; + PyObject *ret = NULL; + Py_buffer b; + if (!PyArg_ParseTuple(args, "O", &obj)) + return NULL; + + if (get_readonly_buffer(obj, &b)) + return NULL; + + ret = get_extents(b.shape, b.strides, b.ndim, b.itemsize, + (Py_ssize_t)b.buf); + free_buffer(&b); + return ret; +} + +static PyObject* +memoryview_get_extents_info(PyObject *self, PyObject *args) +{ + int i; + Py_ssize_t *shape_ary = NULL; + Py_ssize_t *strides_ary = NULL; + PyObject *shape_tuple = NULL; + PyObject *strides_tuple = NULL; + PyObject *shape = NULL, *strides = NULL; + Py_ssize_t itemsize = 0; + int ndim = 0; + PyObject* res = NULL; + + if (!PyArg_ParseTuple(args, "OOin", &shape, &strides, &ndim, &itemsize)) + goto cleanup; + + if (ndim < 0) { + PyErr_SetString(PyExc_ValueError, "ndim is negative"); + goto cleanup; + } + + if (itemsize <= 0) { + PyErr_SetString(PyExc_ValueError, "ndim <= 0"); + goto cleanup; + } + + shape_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + strides_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + + shape_tuple = PySequence_Fast(shape, "shape is not a sequence"); + if (!shape_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + shape_ary[i] = PyNumber_AsSsize_t( + PySequence_Fast_GET_ITEM(shape_tuple, i), + PyExc_OverflowError); + } + + strides_tuple = PySequence_Fast(strides, "strides is not a sequence"); + if (!strides_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + strides_ary[i] = PyNumber_AsSsize_t( + PySequence_Fast_GET_ITEM(strides_tuple, i), + PyExc_OverflowError); + } + + res = get_extents(shape_ary, strides_ary, ndim, itemsize, 0); +cleanup: + free(shape_ary); + free(strides_ary); + Py_XDECREF(shape_tuple); + Py_XDECREF(strides_tuple); + return res; +} + + +/* new type to expose buffer interface */ +typedef struct { + PyObject_HEAD + /* Type-specific fields go here. 
*/ +} MemAllocObject; + + +static int +get_bufinfo(PyObject *self, Py_ssize_t *psize, void **pptr) +{ + PyObject *buflen = NULL; + PyObject *bufptr = NULL; + Py_ssize_t size = 0; + void* ptr = NULL; + int ret = -1; + + buflen = PyObject_GetAttrString(self, "_buflen_"); + if (!buflen) goto cleanup; + + bufptr = PyObject_GetAttrString(self, "_bufptr_"); + if (!bufptr) goto cleanup; + + size = PyNumber_AsSsize_t(buflen, PyExc_OverflowError); + if (size == -1 && PyErr_Occurred()) goto cleanup; + else if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative buffer size"); + goto cleanup; + } + + ptr = PyLong_AsVoidPtr(PyNumber_Long(bufptr)); + if (PyErr_Occurred()) + goto cleanup; + else if (!ptr) { + PyErr_SetString(PyExc_ValueError, "null buffer pointer"); + goto cleanup; + } + + *psize = size; + *pptr = ptr; + ret = 0; +cleanup: + Py_XDECREF(buflen); + Py_XDECREF(bufptr); + return ret; +} + + +static int +MemAllocObject_getbuffer(PyObject *self, Py_buffer *view, int flags) +{ + Py_ssize_t size = 0; + void *ptr = 0; + int readonly; + + if(-1 == get_bufinfo(self, &size, &ptr)) + return -1; + + readonly = (PyBUF_WRITABLE & flags) != PyBUF_WRITABLE; + + /* fill buffer */ + if (-1 == PyBuffer_FillInfo(view, self, (void*)ptr, size, readonly, flags)) + return -1; + + return 0; +} + +static void +MemAllocObject_releasebuffer(PyObject *self, Py_buffer *view) +{ + /* Do nothing */ +} + +static PyBufferProcs MemAlloc_as_buffer = { + MemAllocObject_getbuffer, + MemAllocObject_releasebuffer, +}; + + +static PyTypeObject MemAllocType = { + PyVarObject_HEAD_INIT(NULL, 0) + "mviewbuf.MemAlloc", /* tp_name */ + sizeof(MemAllocObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + &MemAlloc_as_buffer, /* tp_as_buffer */ + (Py_TPFLAGS_DEFAULT| Py_TPFLAGS_BASETYPE), /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." 
+#endif +/* END WARNING*/ +}; + + +static PyMethodDef core_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(memoryview_get_buffer), + declmethod(memoryview_get_extents), + declmethod(memoryview_get_extents_info), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(mviewbuf) { + PyObject *module; + MOD_DEF(module, "mviewbuf", "No docs", core_methods) + if (module == NULL) + return MOD_ERROR_VAL; + + MemAllocType.tp_new = PyType_GenericNew; + if (PyType_Ready(&MemAllocType) < 0){ + return MOD_ERROR_VAL; + } + + Py_INCREF(&MemAllocType); + PyModule_AddObject(module, "MemAlloc", (PyObject*)&MemAllocType); + + return MOD_SUCCESS_VAL(module); +} diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp b/numba_cuda/cext/typeconv.cpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/typeconv.cpp rename to numba_cuda/cext/typeconv.cpp diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp b/numba_cuda/cext/typeconv.hpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/typeconv.hpp rename to numba_cuda/cext/typeconv.hpp diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 819d8a6e4..0a6b3cb88 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -13,7 +13,7 @@ import numpy as np import numba -from numba import _devicearray +from numba_cuda.cext import _devicearray from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 7470cb8b5..fdce25eb5 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -43,7 +43,7 @@ from collections import namedtuple, deque -from numba import mviewbuf +from numba_cuda.cext import mviewbuf from numba.core import config from numba.cuda import utils, serialize from .error import CudaSupportError, CudaDriverError diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index a88093e31..8d38cf333 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba.cuda import _dispatcher +from numba_cuda.cext import _dispatcher from warnings import warn diff --git a/setup.py b/setup.py index c4a4b671b..7ecc40714 100644 --- a/setup.py +++ b/setup.py @@ -37,28 +37,49 @@ def get_ext_modules(): "m", ] + ext_devicearray = Extension( + name="numba_cuda.cext._devicearray", + sources=["numba_cuda/cext/_devicearray.cpp"], + depends=[ + "numba_cuda/cext/_pymodule.h", + "numba_cuda/cext/_devicearray.h", + ], + include_dirs=["numba_cuda/cext"], + extra_compile_args=["-std=c++11"], + ) + + install_name_tool_fixer = [] + if sys.platform == "darwin": + install_name_tool_fixer = ["-headerpad_max_install_names"] + + ext_mviewbuf = Extension( + name="numba_cuda.cext.mviewbuf", + extra_link_args=install_name_tool_fixer, + sources=["numba_cuda/cext/mviewbuf.c"], + ) + dispatcher_sources = [ - "numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp", - "numba_cuda/numba/cuda/_dispatcher/_typeof.cpp", - "numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp", - "numba_cuda/numba/cuda/_dispatcher/typeconv.cpp", + "numba_cuda/cext/_dispatcher.cpp", + "numba_cuda/cext/_typeof.cpp", + 
"numba_cuda/cext/_hashtable.cpp", + "numba_cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda.numba.cuda._dispatcher", + name="numba_cuda.cext._dispatcher", sources=dispatcher_sources, depends=[ - "numba_cuda/numba/cuda/_dispatcher/_pymodule.h", - "numba_cuda/numba/cuda/_dispatcher/_typeof.h", - "numba_cuda/numba/cuda/_dispatcher/_hashtable.h", + "numba_cuda/cext/_pymodule.h", + "numba_cuda/cext/_typeof.h", + "numba_cuda/cext/_hashtable.h", ], extra_compile_args=["-std=c++11"], **np_compile_args, ) - # Add our include directory to the existing include_dirs - ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/_dispatcher") + # Append our cext dir to include_dirs + ext_dispatcher.include_dirs.append("numba_cuda/cext") - return [ext_dispatcher] + return [ext_dispatcher, ext_mviewbuf, ext_devicearray] def is_building(): From 5c80d566cfd156323bc345a2109fc506ea1693e1 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 12 Aug 2025 10:25:34 -0700 Subject: [PATCH 09/62] Update CI numpy version reqs and build matrix --- ci/matrix.yml | 8 ++++++++ conda/recipes/numba-cuda/meta.yaml | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 47019cc16..2e771b820 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,7 +1,15 @@ build-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index cba6deb08..6beeac6a7 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,7 +25,8 @@ requirements: - python - pip - setuptools - - numpy + - numpy >= 2.1.0 # [py >= 310] + - numpy == 1.26.0 # [py == 39] run: - python - numba >=0.59.1 From 22a41f89a6501dc089fca39d00a04217f7239f14 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 12 Aug 2025 10:28:29 -0700 Subject: [PATCH 10/62] Fix numpy ver requirements --- conda/recipes/numba-cuda/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 6beeac6a7..7a26d6ad0 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,8 +25,8 @@ requirements: - python - pip - setuptools - - numpy >= 2.1.0 # [py >= 310] - - numpy == 1.26.0 # [py == 39] + - numpy >=2.1.0 # [py >= 310] + - numpy ==1.26.0 # [py == 39] run: - python - numba >=0.59.1 From 47e5b1851ab33543d4e707840ef6021d5b3e482f Mon Sep 17 00:00:00 2001 
From: Vijay Kandiah Date: Tue, 12 Aug 2025 10:34:57 -0700 Subject: [PATCH 11/62] Remove additional configs in build-matrix and simulator-matrix --- ci/matrix.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 2e771b820..47019cc16 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,15 +1,7 @@ build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } From 52da47eeaa4c3ef4b583e4163fc0bbc460101ccb Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 12 Aug 2025 11:02:35 -0700 Subject: [PATCH 12/62] Fix setup.py build_commands list --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7ecc40714..448547e0b 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,8 @@ def is_building(): "build", "build_py", "build_ext", - "build_clibbuild_scripts", + "build_clib", + "build_scripts", "install", "install_lib", "install_headers", @@ -109,7 +110,7 @@ def is_building(): "bdist_rpm", "bdist_wininst", "check", - "build_doc", + "build_docs", "bdist_wheel", "bdist_egg", "develop", From 366b3d74720508285f689201de03b2656e43ed58 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 19 Aug 2025 21:13:39 +0100 Subject: [PATCH 13/62] Make numba_cuda.cext a package --- numba_cuda/cext/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 numba_cuda/cext/__init__.py diff --git a/numba_cuda/cext/__init__.py b/numba_cuda/cext/__init__.py new file mode 100644 index 000000000..e69de29bb From 487e872a943ceaa69a6f7b7bcbd969068783aecd Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 19 Aug 2025 21:50:52 +0100 Subject: [PATCH 14/62] Modify build matrix for each Python version This is not sufficient. We also need: - One build for each Python version for both of amd64 and aarch64 - The docs build to be changed to download the py313-amd64 wheels - Maybe other things I missed? 
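
For reference, the build artifacts become per-Python-version, presumably so
that the matrix jobs do not overwrite each other's uploads. A rough sketch
of the naming scheme the workflows move to (plain Python for illustration
only; the real names are produced by the workflow templates, not this
snippet):

    # Sketch: per-version artifact names used by the upload/download steps.
    python_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"]
    conda_artifacts = ["conda-repo-py" + v for v in python_versions]
    wheel_artifacts = ["wheel-py" + v for v in python_versions]
    sdist_artifacts = ["sdist-py" + v for v in python_versions]
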
--- .github/workflows/conda-python-build.yaml | 2 +- .github/workflows/conda-python-tests.yaml | 2 +- .github/workflows/wheels-build.yaml | 4 ++-- .github/workflows/wheels-test.yaml | 2 +- ci/matrix.yml | 4 ++++ 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml index bec7bae68..f71890e91 100644 --- a/.github/workflows/conda-python-build.yaml +++ b/.github/workflows/conda-python-build.yaml @@ -101,7 +101,7 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }} path: "/tmp/conda-bld-output" - name: Publish conda package if: inputs.upload_to_anaconda diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 3623fd40d..3f7552ede 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -93,7 +93,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml index 2c86cc1b4..15e39fb56 100644 --- a/.github/workflows/wheels-build.yaml +++ b/.github/workflows/wheels-build.yaml @@ -149,11 +149,11 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: sdist + name: sdist-py${{ matrix.PY_VER }} path: ${{ env.sdist_path }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel + name: wheel-py${{ matrix.PY_VER }} path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 3c6d294d3..fe354ff77 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -111,7 +111,7 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel + name: wheel-py${{ matrix.PY_VER }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/matrix.yml b/ci/matrix.yml index 47019cc16..611930905 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,4 +1,8 @@ build-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } From 42bea6ab6530ea1954fae5066f187beed93f81b7 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 11:56:07 -0700 Subject: [PATCH 15/62] Move cext to numba_cuda.numba.cuda and enable build matrix for arm64 --- .github/workflows/conda-python-build.yaml | 2 +- .github/workflows/conda-python-tests.yaml | 2 +- .github/workflows/wheels-build.yaml | 4 +-- .github/workflows/wheels-test.yaml | 2 +- ci/matrix.yml | 5 +++ numba_cuda/{ => numba/cuda}/cext/__init__.py | 0 .../{ => numba/cuda}/cext/_devicearray.cpp | 2 +- .../{ => numba/cuda}/cext/_devicearray.h | 0 .../{ => numba/cuda}/cext/_dispatcher.cpp | 7 ++-- .../{ => 
numba/cuda}/cext/_hashtable.cpp | 0 numba_cuda/{ => numba/cuda}/cext/_hashtable.h | 0 numba_cuda/{ => numba/cuda}/cext/_pymodule.h | 0 numba_cuda/{ => numba/cuda}/cext/_typeof.cpp | 0 numba_cuda/{ => numba/cuda}/cext/_typeof.h | 0 numba_cuda/{ => numba/cuda}/cext/mviewbuf.c | 0 numba_cuda/{ => numba/cuda}/cext/typeconv.cpp | 0 numba_cuda/{ => numba/cuda}/cext/typeconv.hpp | 0 numba_cuda/numba/cuda/cudadrv/devicearray.py | 2 +- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/dispatcher.py | 2 +- setup.py | 32 +++++++++---------- 21 files changed, 33 insertions(+), 29 deletions(-) rename numba_cuda/{ => numba/cuda}/cext/__init__.py (100%) rename numba_cuda/{ => numba/cuda}/cext/_devicearray.cpp (99%) rename numba_cuda/{ => numba/cuda}/cext/_devicearray.h (100%) rename numba_cuda/{ => numba/cuda}/cext/_dispatcher.cpp (99%) rename numba_cuda/{ => numba/cuda}/cext/_hashtable.cpp (100%) rename numba_cuda/{ => numba/cuda}/cext/_hashtable.h (100%) rename numba_cuda/{ => numba/cuda}/cext/_pymodule.h (100%) rename numba_cuda/{ => numba/cuda}/cext/_typeof.cpp (100%) rename numba_cuda/{ => numba/cuda}/cext/_typeof.h (100%) rename numba_cuda/{ => numba/cuda}/cext/mviewbuf.c (100%) rename numba_cuda/{ => numba/cuda}/cext/typeconv.cpp (100%) rename numba_cuda/{ => numba/cuda}/cext/typeconv.hpp (100%) diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml index f71890e91..84e6a6951 100644 --- a/.github/workflows/conda-python-build.yaml +++ b/.github/workflows/conda-python-build.yaml @@ -101,7 +101,7 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: conda-repo-py${{ matrix.PY_VER }} + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: "/tmp/conda-bld-output" - name: Publish conda package if: inputs.upload_to_anaconda diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 3f7552ede..02ead9985 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -93,7 +93,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }} + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml index 15e39fb56..c876bfb16 100644 --- a/.github/workflows/wheels-build.yaml +++ b/.github/workflows/wheels-build.yaml @@ -149,11 +149,11 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: sdist-py${{ matrix.PY_VER }} + name: sdist-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.sdist_path }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel-py${{ matrix.PY_VER }} + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index fe354ff77..f45b5f1fd 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -111,7 +111,7 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel-py${{ matrix.PY_VER }} + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/matrix.yml 
b/ci/matrix.yml index 611930905..d8da7b351 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -4,6 +4,11 @@ build-matrix: - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: diff --git a/numba_cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py similarity index 100% rename from numba_cuda/cext/__init__.py rename to numba_cuda/numba/cuda/cext/__init__.py diff --git a/numba_cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp similarity index 99% rename from numba_cuda/cext/_devicearray.cpp rename to numba_cuda/numba/cuda/cext/_devicearray.cpp index a822c6261..645aafdce 100644 --- a/numba_cuda/cext/_devicearray.cpp +++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp @@ -120,7 +120,7 @@ MOD_INIT(_devicearray) { if (m == NULL) goto error_occurred; - c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda.cext._devicearray._DEVICEARRAY_API", NULL); + c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda._devicearray._DEVICEARRAY_API", NULL); if (c_api == NULL) goto error_occurred; diff --git a/numba_cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h similarity index 100% rename from numba_cuda/cext/_devicearray.h rename to numba_cuda/numba/cuda/cext/_devicearray.h diff --git a/numba_cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp similarity index 99% rename from numba_cuda/cext/_dispatcher.cpp rename to numba_cuda/numba/cuda/cext/_dispatcher.cpp index b2f3e1c54..2e8e1a882 100644 --- a/numba_cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -935,14 +935,13 @@ Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) static int import_devicearray(void) { - // Import our vendored devicearray (following original numba's pattern) - PyObject *devicearray = PyImport_ImportModule("numba_cuda.cext._devicearray"); + PyObject *devicearray = PyImport_ImportModule("numba_cuda._devicearray"); if (devicearray == NULL) { return -1; } Py_DECREF(devicearray); - DeviceArray_API = (void**)PyCapsule_Import("numba_cuda.cext._devicearray._DEVICEARRAY_API", 0); + DeviceArray_API = (void**)PyCapsule_Import("numba_cuda._devicearray._DEVICEARRAY_API", 0); if (DeviceArray_API == NULL) { return -1; } @@ -1057,7 +1056,7 @@ static PyMethodDef ext_methods[] = { MOD_INIT(_dispatcher) { if (import_devicearray() < 0) { PyErr_Print(); - PyErr_SetString(PyExc_ImportError, "numba_cuda.cext._devicearray failed to import"); + PyErr_SetString(PyExc_ImportError, "numba_cuda._devicearray failed to import"); return MOD_ERROR_VAL; } diff --git a/numba_cuda/cext/_hashtable.cpp b/numba_cuda/numba/cuda/cext/_hashtable.cpp similarity index 100% rename from numba_cuda/cext/_hashtable.cpp rename to numba_cuda/numba/cuda/cext/_hashtable.cpp diff --git a/numba_cuda/cext/_hashtable.h 
b/numba_cuda/numba/cuda/cext/_hashtable.h similarity index 100% rename from numba_cuda/cext/_hashtable.h rename to numba_cuda/numba/cuda/cext/_hashtable.h diff --git a/numba_cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h similarity index 100% rename from numba_cuda/cext/_pymodule.h rename to numba_cuda/numba/cuda/cext/_pymodule.h diff --git a/numba_cuda/cext/_typeof.cpp b/numba_cuda/numba/cuda/cext/_typeof.cpp similarity index 100% rename from numba_cuda/cext/_typeof.cpp rename to numba_cuda/numba/cuda/cext/_typeof.cpp diff --git a/numba_cuda/cext/_typeof.h b/numba_cuda/numba/cuda/cext/_typeof.h similarity index 100% rename from numba_cuda/cext/_typeof.h rename to numba_cuda/numba/cuda/cext/_typeof.h diff --git a/numba_cuda/cext/mviewbuf.c b/numba_cuda/numba/cuda/cext/mviewbuf.c similarity index 100% rename from numba_cuda/cext/mviewbuf.c rename to numba_cuda/numba/cuda/cext/mviewbuf.c diff --git a/numba_cuda/cext/typeconv.cpp b/numba_cuda/numba/cuda/cext/typeconv.cpp similarity index 100% rename from numba_cuda/cext/typeconv.cpp rename to numba_cuda/numba/cuda/cext/typeconv.cpp diff --git a/numba_cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp similarity index 100% rename from numba_cuda/cext/typeconv.hpp rename to numba_cuda/numba/cuda/cext/typeconv.hpp diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 0a6b3cb88..31d50abe2 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -13,7 +13,7 @@ import numpy as np import numba -from numba_cuda.cext import _devicearray +from numba_cuda import _devicearray from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index fdce25eb5..f88fa6712 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -43,7 +43,7 @@ from collections import namedtuple, deque -from numba_cuda.cext import mviewbuf +from numba.cuda import mviewbuf from numba.core import config from numba.cuda import utils, serialize from .error import CudaSupportError, CudaDriverError diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 8d38cf333..a88093e31 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba_cuda.cext import _dispatcher +from numba.cuda import _dispatcher from warnings import warn diff --git a/setup.py b/setup.py index 448547e0b..a0a637bc2 100644 --- a/setup.py +++ b/setup.py @@ -38,13 +38,13 @@ def get_ext_modules(): ] ext_devicearray = Extension( - name="numba_cuda.cext._devicearray", - sources=["numba_cuda/cext/_devicearray.cpp"], + name="numba_cuda._devicearray", + sources=["numba_cuda/numba/cuda/cext/_devicearray.cpp"], depends=[ - "numba_cuda/cext/_pymodule.h", - "numba_cuda/cext/_devicearray.h", + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_devicearray.h", ], - include_dirs=["numba_cuda/cext"], + include_dirs=["numba_cuda/numba/cuda/cext"], extra_compile_args=["-std=c++11"], ) @@ -53,31 +53,31 @@ def get_ext_modules(): install_name_tool_fixer = ["-headerpad_max_install_names"] ext_mviewbuf = Extension( - name="numba_cuda.cext.mviewbuf", + name="numba_cuda.numba.cuda.mviewbuf", 
extra_link_args=install_name_tool_fixer, - sources=["numba_cuda/cext/mviewbuf.c"], + sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], ) dispatcher_sources = [ - "numba_cuda/cext/_dispatcher.cpp", - "numba_cuda/cext/_typeof.cpp", - "numba_cuda/cext/_hashtable.cpp", - "numba_cuda/cext/typeconv.cpp", + "numba_cuda/numba/cuda/cext/_dispatcher.cpp", + "numba_cuda/numba/cuda/cext/_typeof.cpp", + "numba_cuda/numba/cuda/cext/_hashtable.cpp", + "numba_cuda/numba/cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda.cext._dispatcher", + name="numba_cuda.numba.cuda._dispatcher", sources=dispatcher_sources, depends=[ - "numba_cuda/cext/_pymodule.h", - "numba_cuda/cext/_typeof.h", - "numba_cuda/cext/_hashtable.h", + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_typeof.h", + "numba_cuda/numba/cuda/cext/_hashtable.h", ], extra_compile_args=["-std=c++11"], **np_compile_args, ) # Append our cext dir to include_dirs - ext_dispatcher.include_dirs.append("numba_cuda/cext") + ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/cext") return [ext_dispatcher, ext_mviewbuf, ext_devicearray] From 7cfe7c3f5253751d29f48b402e9aea41fe10c112 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 12:02:49 -0700 Subject: [PATCH 16/62] Make py3.9 arm64 and amd64 builds use cuda_ver 11.8.0 --- ci/matrix.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index d8da7b351..247027305 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,10 +1,10 @@ build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } From 88c0b2cda064d46eb2a05c29e2bcc01a1e2edb66 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 12:19:23 -0700 Subject: [PATCH 17/62] Make py3.9 a supported python version for cext. 
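
The cext sources carry a compile-time guard on the interpreter version;
this change extends it to accept 3.9 alongside 3.10-3.13. Roughly the same
check expressed in Python (illustration only; the actual guard is the
NB_SUPPORTED_PYTHON_MINOR preprocessor macro in _pymodule.h):

    import sys

    # Sketch mirroring the compile-time guard: major must be 3, minor 9..13.
    SUPPORTED_MINORS = {9, 10, 11, 12, 13}
    if sys.version_info[0] != 3 or sys.version_info[1] not in SUPPORTED_MINORS:
        raise RuntimeError("Python version is not supported.")
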
--- numba_cuda/numba/cuda/cext/_pymodule.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h index c261314f5..47eedcb8a 100644 --- a/numba_cuda/numba/cuda/cext/_pymodule.h +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -30,6 +30,6 @@ Py_DECREF(tmp); } while (0) -#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 9) || (PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) #endif /* NUMBA_PY_MODULE_H_ */ From 970e5bc51bb9a205ddf6f65e25c1f2a5aac86140 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 12:29:21 -0700 Subject: [PATCH 18/62] Make CI build-docs and simulator workflows point to correct build --- .github/workflows/docs-build.yaml | 2 +- .github/workflows/simulator-test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index ac9b8f244..d1b6a23d0 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -56,7 +56,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index 015efe8bb..625a69882 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -83,7 +83,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information From afaea07b6b20f720a59ce0b388b869959605728f Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 13:42:54 -0700 Subject: [PATCH 19/62] Make CI scripts point to the right conda-repo-py dirs --- ci/build_docs.sh | 13 ++++++++++++- ci/test_conda.sh | 13 ++++++++++++- ci/test_conda_ctypes_binding.sh | 13 ++++++++++++- ci/test_simulator.sh | 13 ++++++++++++- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 74816c6f2..70d27619b 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -20,7 +20,18 @@ set +u conda activate docs set -u -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda pip install nvidia-sphinx-theme diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 70f4dc124..07ea5dbf5 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -38,7 +38,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + 
exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index a274c021e..7aa11aab3 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -33,7 +33,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index 832e2cbf7..e545bee49 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -21,7 +21,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" From f9d9e22cc2233498b2db96a261de325d9902f0db Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 27 Aug 2025 09:35:29 -0700 Subject: [PATCH 20/62] Change numpy ver for py3.9 build to 2.0.2 to support 1.x and 2.x --- conda/recipes/numba-cuda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 7a26d6ad0..658cfdeac 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -26,7 +26,7 @@ requirements: - pip - setuptools - numpy >=2.1.0 # [py >= 310] - - numpy ==1.26.0 # [py == 39] + - numpy ==2.0.2 # [py == 39] run: - python - numba >=0.59.1 From 24c5fda8af95ba2500101da955fee2f07bffdf61 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 27 Aug 2025 15:24:19 -0700 Subject: [PATCH 21/62] CI with pip install -e to ensure C extensions are properly linked --- conda/recipes/numba-cuda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 658cfdeac..b0876963d 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -18,7 +18,7 @@ source: build: noarch: python script: - - {{ PYTHON }} -m pip install . -vv + - {{ PYTHON }} -m pip install . -e -vv requirements: host: From f289d122055504abfa30cf5ee74b7b52cb115000 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 27 Aug 2025 15:25:25 -0700 Subject: [PATCH 22/62] Fix CI build --- conda/recipes/numba-cuda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index b0876963d..65cef4692 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -18,7 +18,7 @@ source: build: noarch: python script: - - {{ PYTHON }} -m pip install . 
-e -vv + - {{ PYTHON }} -m pip install -e . -vv requirements: host: From 3c48a3d1946db3018ed0ba1f07208e4f5e722e04 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Fri, 29 Aug 2025 16:37:19 -0700 Subject: [PATCH 23/62] Hack C extensions into import mechanism --- numba_cuda/numba/cuda/cext/__init__.py | 91 ++++++++++++++++++++ numba_cuda/numba/cuda/cext/_devicearray.cpp | 2 +- numba_cuda/numba/cuda/cext/_devicearray.h | 1 + numba_cuda/numba/cuda/cext/_dispatcher.cpp | 6 +- numba_cuda/numba/cuda/cudadrv/devicearray.py | 2 +- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/dispatcher.py | 2 +- setup.py | 4 +- 8 files changed, 101 insertions(+), 9 deletions(-) diff --git a/numba_cuda/numba/cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py index e69de29bb..4b1816572 100644 --- a/numba_cuda/numba/cuda/cext/__init__.py +++ b/numba_cuda/numba/cuda/cext/__init__.py @@ -0,0 +1,91 @@ +import sys +import importlib +import importlib.util +import importlib.machinery +from pathlib import Path +from types import ModuleType +from importlib.machinery import ModuleSpec + + +def _load_ext_from_spec( + spec: ModuleSpec, fullname: str, legacy_name: str +) -> ModuleType: + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[fullname] = module + sys.modules[legacy_name] = ( + module # Register under legacy name for C extensions + ) + + # Ensure parent modules exist for legacy name (e.g., numba_cuda for numba_cuda._devicearray) + parts = legacy_name.split(".") + for i in range(1, len(parts)): + parent_name = ".".join(parts[:i]) + if parent_name not in sys.modules: + parent_module = ModuleType(parent_name) + sys.modules[parent_name] = parent_module + + # Set the child as an attribute of the parent + parent_module = sys.modules[parent_name] + child_name = parts[i] + if i == len(parts) - 1: # This is the final module + setattr(parent_module, child_name, module) + elif not hasattr(parent_module, child_name): + # Create intermediate module if it doesn't exist + intermediate_name = ".".join(parts[: i + 1]) + if intermediate_name not in sys.modules: + intermediate_module = ModuleType(intermediate_name) + sys.modules[intermediate_name] = intermediate_module + setattr(parent_module, child_name, intermediate_module) + + spec.loader.exec_module(module) + return module + + +def _find_in_dir( + module_name: str, directory: Path | str | None +) -> ModuleSpec | None: + if not directory: + return None + return importlib.machinery.PathFinder.find_spec( + module_name, [str(directory)] + ) + + +def _load_cext_module( + module_basename: str, required: bool = True +) -> ModuleType | None: + fullname = f"numba.cuda.cext.{module_basename}" + legacy_name = f"numba_cuda.{module_basename}" + + # 1) Try local numba_cuda directory (for development builds) + local_numba_cuda = Path(__file__).parents[ + 3 + ] # Go up from cext/ to numba_cuda/ + spec = _find_in_dir(module_basename, local_numba_cuda) + + # 2) Fallback: scan sys.path for installed numba_cuda directory + if spec is None: + for entry in sys.path: + numba_cuda_dir = Path(entry) / "numba_cuda" + spec = _find_in_dir(module_basename, numba_cuda_dir) + if spec is not None: + break + + if spec is None: + if required: + raise ModuleNotFoundError( + f"Could not find '{module_basename}' in numba_cuda directories" + ) + return None + + return _load_ext_from_spec(spec, fullname, legacy_name) + + +# Load known cext modules (all required) +# Load _devicearray first since _dispatcher depends on it +_devicearray = 
_load_cext_module("_devicearray", required=True) +_dispatcher = _load_cext_module("_dispatcher", required=True) +mviewbuf = _load_cext_module("mviewbuf", required=True) + +__all__ = ["mviewbuf", "_dispatcher", "_devicearray"] diff --git a/numba_cuda/numba/cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp index 645aafdce..cdfc73595 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.cpp +++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp @@ -120,7 +120,7 @@ MOD_INIT(_devicearray) { if (m == NULL) goto error_occurred; - c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda._devicearray._DEVICEARRAY_API", NULL); + c_api = PyCapsule_New((void *)_DeviceArray_API, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API", NULL); if (c_api == NULL) goto error_occurred; diff --git a/numba_cuda/numba/cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h index 5b276eacf..ab469553a 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.h +++ b/numba_cuda/numba/cuda/cext/_devicearray.h @@ -5,6 +5,7 @@ extern "C" { #endif +#define NUMBA_DEVICEARRAY_IMPORT_NAME "numba_cuda._devicearray" /* These definitions should only be used by consumers of the Device Array API. * Consumers access the API through the opaque pointer stored in * _devicearray._DEVICEARRAY_API. We don't want these definitions in diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index 2e8e1a882..fc3757a10 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -935,13 +935,13 @@ Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) static int import_devicearray(void) { - PyObject *devicearray = PyImport_ImportModule("numba_cuda._devicearray"); + PyObject *devicearray = PyImport_ImportModule(NUMBA_DEVICEARRAY_IMPORT_NAME); if (devicearray == NULL) { return -1; } Py_DECREF(devicearray); - DeviceArray_API = (void**)PyCapsule_Import("numba_cuda._devicearray._DEVICEARRAY_API", 0); + DeviceArray_API = (void**)PyCapsule_Import(NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API", 0); if (DeviceArray_API == NULL) { return -1; } @@ -1056,7 +1056,7 @@ static PyMethodDef ext_methods[] = { MOD_INIT(_dispatcher) { if (import_devicearray() < 0) { PyErr_Print(); - PyErr_SetString(PyExc_ImportError, "numba_cuda._devicearray failed to import"); + PyErr_SetString(PyExc_ImportError, NUMBA_DEVICEARRAY_IMPORT_NAME " failed to import"); return MOD_ERROR_VAL; } diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 31d50abe2..e0799894f 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -13,7 +13,7 @@ import numpy as np import numba -from numba_cuda import _devicearray +from numba.cuda.cext import _devicearray from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index f88fa6712..84f94c1a1 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -43,7 +43,7 @@ from collections import namedtuple, deque -from numba.cuda import mviewbuf +from numba.cuda.cext import mviewbuf from numba.core import config from numba.cuda import utils, serialize from .error import CudaSupportError, CudaDriverError diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 
a88093e31..f471f11ff 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba.cuda import _dispatcher +from numba.cuda.cext import _dispatcher from warnings import warn diff --git a/setup.py b/setup.py index a0a637bc2..7ab1dd3af 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def get_ext_modules(): install_name_tool_fixer = ["-headerpad_max_install_names"] ext_mviewbuf = Extension( - name="numba_cuda.numba.cuda.mviewbuf", + name="numba_cuda.mviewbuf", extra_link_args=install_name_tool_fixer, sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], ) @@ -65,7 +65,7 @@ def get_ext_modules(): "numba_cuda/numba/cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda.numba.cuda._dispatcher", + name="numba_cuda._dispatcher", sources=dispatcher_sources, depends=[ "numba_cuda/numba/cuda/cext/_pymodule.h", From 618a885b3fd007ad7131a16aa024f553802b6c00 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Fri, 29 Aug 2025 16:48:27 -0700 Subject: [PATCH 24/62] Fix redirector --- site-packages/_numba_cuda_redirector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py index 1c76609ac..ffeca64a5 100644 --- a/site-packages/_numba_cuda_redirector.py +++ b/site-packages/_numba_cuda_redirector.py @@ -31,7 +31,7 @@ def ensure_initialized(self): numba_cuda_spec = importlib.util.find_spec("numba_cuda") - if numba_spec is None: + if numba_cuda_spec is None: warnings.warn(no_spec_msg.format("numba_cuda")) self.initialized = False return False From 582b66ecf5ba14cac25991dee988cb29a0d43f88 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Sat, 30 Aug 2025 09:13:57 -0700 Subject: [PATCH 25/62] Shorten CI runs for development --- ci/matrix.yml | 36 ++++++++++++++++++------------------ ci/test_conda.sh | 1 + 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 247027305..e11ea7e12 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,24 +1,24 @@ build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 
'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: - - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } - - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest' } + # - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } + # - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } + # - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest' } - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest' } - - { CUDA_VER: '12.8.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest' } - - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest' } - - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } - - { CUDA_VER: '12.8.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.8.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest' } + # - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest' } + # - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.8.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest' } diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 07ea5dbf5..ad124c630 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -2,6 +2,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION set -euo pipefail +set -x . 
/opt/conda/etc/profile.d/conda.sh From 203e152f2567253390b40b93d1d4c8c1644c4fbf Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Sat, 30 Aug 2025 09:28:56 -0700 Subject: [PATCH 26/62] Debugging CI --- ci/test_conda.sh | 5 +++++ numba_cuda/numba/cuda/testing.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index ad124c630..958db3f5d 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -76,6 +76,11 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " +python -c "import numba.cuda as cuda; print(cuda.__file__)" +python -c "import numba_cuda as cuda; print(cuda.__file__)" +pip freeze | grep numba +python -c "import numba.cuda.testing; print(numba.cuda.testing.test_binary_generation_dir())" + CUDA_VER_MAJOR_MINOR=${CUDA_VER%.*} if [ "${CUDA_VER_MAJOR_MINOR%.*}" == "11" ] then diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py index 91d1cf243..1d7866a99 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -348,3 +348,9 @@ class ForeignArray(object): def __init__(self, arr): self._arr = arr self.__cuda_array_interface__ = arr.__cuda_array_interface__ + + +def test_binary_generation_dir(): + testing = Path(__file__) + binary_gen_dir = testing.parent / "tests" / "test_binary_generation" + return binary_gen_dir From 12026f895353a38ea19698b0a6e832e317990381 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:10:49 +0100 Subject: [PATCH 27/62] Use platform-dependent wheel path --- ci/test_wheel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index a6c474fde..bdc6b50b9 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -7,7 +7,7 @@ set -euo pipefail CUDA_VER_MAJOR_MINOR=${CUDA_VER%.*} rapids-logger "Install wheel with test dependencies" -package=$(realpath wheel/numba_cuda*.whl) +package=$(realpath wheel*/numba_cuda*.whl) echo "Package path: ${package}" DEPENDENCIES=( From 4dba8dbcca1264b91715e41864005a1b9e84696d Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:29:37 +0100 Subject: [PATCH 28/62] Build modules in cext --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4f959e043..cb5d4b4db 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ def get_ext_modules(): ] ext_devicearray = Extension( - name="numba_cuda._devicearray", + name="numba_cuda.numba.cuda.cext._devicearray", sources=["numba_cuda/numba/cuda/cext/_devicearray.cpp"], depends=[ "numba_cuda/numba/cuda/cext/_pymodule.h", @@ -55,7 +55,7 @@ def get_ext_modules(): install_name_tool_fixer = ["-headerpad_max_install_names"] ext_mviewbuf = Extension( - name="numba_cuda.mviewbuf", + name="numba_cuda.numba.cuda.cext.mviewbuf", extra_link_args=install_name_tool_fixer, sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], ) @@ -67,7 +67,7 @@ def get_ext_modules(): "numba_cuda/numba/cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda._dispatcher", + name="numba_cuda.numba.cuda.cext._dispatcher", sources=dispatcher_sources, depends=[ "numba_cuda/numba/cuda/cext/_pymodule.h", From 486dfae03c82dcfd6e685378f920907d61d1e672 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:32:49 +0100 Subject: [PATCH 29/62] Don't use editable install in conda build --- conda/recipes/numba-cuda/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/recipes/numba-cuda/meta.yaml 
b/conda/recipes/numba-cuda/meta.yaml index f792814bc..8016ca6d3 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -17,9 +17,8 @@ source: path: ../../.. build: - noarch: python script: - - {{ PYTHON }} -m pip install -e . -vv + - {{ PYTHON }} -m pip install . -vv requirements: host: From a93e465b8c1d9c908e26f00b11ae39574049a143 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:33:35 +0100 Subject: [PATCH 30/62] Use correct search path for C extensions --- numba_cuda/numba/cuda/cext/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py index 4b1816572..c54155bae 100644 --- a/numba_cuda/numba/cuda/cext/__init__.py +++ b/numba_cuda/numba/cuda/cext/__init__.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + import sys import importlib import importlib.util @@ -67,7 +70,7 @@ def _load_cext_module( # 2) Fallback: scan sys.path for installed numba_cuda directory if spec is None: for entry in sys.path: - numba_cuda_dir = Path(entry) / "numba_cuda" + numba_cuda_dir = Path(entry) / "numba_cuda/numba/cuda/cext" spec = _find_in_dir(module_basename, numba_cuda_dir) if spec is not None: break From 9b59060e9f3977aeeaf2991b60153a01cc1bf969 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:46:05 +0100 Subject: [PATCH 31/62] Remove debug prints --- ci/matrix.yml | 9 +++++++++ ci/test_conda.sh | 5 ----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 6cf2e4dac..7ab320d2f 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -5,7 +5,16 @@ # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES] # build-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } # We test "oldest" dependencies with the oldest supported Python version and diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 321a75eeb..cf51a7522 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -98,11 +98,6 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " -python -c "import numba.cuda as cuda; print(cuda.__file__)" -python -c "import numba_cuda as cuda; print(cuda.__file__)" -pip freeze | grep numba -python -c "import numba.cuda.testing; print(numba.cuda.testing.test_binary_generation_dir())" - rapids-logger "Build tests" export NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$GET_TEST_BINARY_DIR") From 1658db671ba14e41697025b41c42f61fbe09d6e9 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: 
Tue, 2 Sep 2025 20:05:05 +0100 Subject: [PATCH 32/62] COrrect conda-repo location --- ci/test_conda.sh | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index cf51a7522..b0e799d98 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -45,10 +45,25 @@ if [ "${RAPIDS_DEPENDENCIES:-}" = "oldest" ]; then DEPENDENCIES+=("numba==0.60.0") fi +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +PY="${RAPIDS_PY_VER//./}" + +repo=`pwd`/conda-repo-py${PY}-${ARCH} + rapids-mamba-retry create \ -n test \ --strict-channel-priority \ - --channel "`pwd`/conda-repo" \ + --channel "$repo" \ --channel conda-forge \ "${DEPENDENCIES[@]}" @@ -59,18 +74,7 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c ${repo} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" From 6349ad72dc9526a4482affc6647b6a61e14447d4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 10:50:13 +0100 Subject: [PATCH 33/62] Again try to correct conda repo location --- ci/test_conda.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index b0e799d98..94feef921 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -56,9 +56,7 @@ else exit 1 fi -PY="${RAPIDS_PY_VER//./}" - -repo=`pwd`/conda-repo-py${PY}-${ARCH} +repo=`pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH} rapids-mamba-retry create \ -n test \ From 727e6f8fa6bc5a2bf03c162dba1381422930b44b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 10:59:18 +0100 Subject: [PATCH 34/62] Correct usage of `download-artifact` The artiface name should be given as `with.name`, not just `name` - `name` just names the step. 
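
For example, a corrected download step looks like the following sketch (the
artifact name shown simply mirrors the naming convention used elsewhere in
this series):

  - uses: actions/download-artifact@v4
    with:
      name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }}

A `path:` key can also be supplied under `with:` to choose where the artifact
is unpacked; a later patch in this series adds that as well.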
--- .github/workflows/conda-python-tests.yaml | 3 ++- .github/workflows/docs-build.yaml | 3 ++- .github/workflows/simulator-test.yaml | 3 ++- .github/workflows/wheel-windows-tests.yaml | 3 ++- .github/workflows/wheels-test.yaml | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index fcedcf17c..629b4d0fd 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -97,7 +97,8 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 87f6bc58d..945766078 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -59,7 +59,8 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index 78f3e85b8..d76dca639 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -86,7 +86,8 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 1847ccf7e..dd7ce65cf 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -49,7 +49,8 @@ jobs: run: nvidia-smi - uses: actions/download-artifact@v4 - name: wheel + with: + name: wheel - name: Display structure of downloaded files run: Get-ChildItem -Recurse diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 4fd38c889..4c6f5564a 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -115,7 +115,8 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information From dc1bbb83805c8fda2d953824fa7637f8323a9953 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 12:12:32 +0100 Subject: [PATCH 35/62] Revert "Again try to correct conda repo location" This reverts commit 6349ad72dc9526a4482affc6647b6a61e14447d4. 
--- ci/test_conda.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 94feef921..b0e799d98 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -56,7 +56,9 @@ else exit 1 fi -repo=`pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH} +PY="${RAPIDS_PY_VER//./}" + +repo=`pwd`/conda-repo-py${PY}-${ARCH} rapids-mamba-retry create \ -n test \ From fc3c577b3a429e8ddb0d83a4320379d0191779f1 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 12:12:38 +0100 Subject: [PATCH 36/62] Revert "COrrect conda-repo location" This reverts commit 1658db671ba14e41697025b41c42f61fbe09d6e9. --- ci/test_conda.sh | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index b0e799d98..cf51a7522 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -45,25 +45,10 @@ if [ "${RAPIDS_DEPENDENCIES:-}" = "oldest" ]; then DEPENDENCIES+=("numba==0.60.0") fi -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -PY="${RAPIDS_PY_VER//./}" - -repo=`pwd`/conda-repo-py${PY}-${ARCH} - rapids-mamba-retry create \ -n test \ --strict-channel-priority \ - --channel "$repo" \ + --channel "`pwd`/conda-repo" \ --channel conda-forge \ "${DEPENDENCIES[@]}" @@ -74,7 +59,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c ${repo} numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" From 1e507b54ef25004d676d62530405da6cccc33e7c Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 12:17:31 +0100 Subject: [PATCH 37/62] Another attempt to fix artifact locations / paths --- .github/workflows/conda-python-tests.yaml | 1 + .github/workflows/docs-build.yaml | 1 + .github/workflows/simulator-test.yaml | 1 + .github/workflows/wheels-test.yaml | 1 + ci/build_docs.sh | 13 +------------ ci/test_conda.sh | 2 +- ci/test_conda_ctypes_binding.sh | 13 +------------ ci/test_simulator.sh | 13 +------------ ci/test_wheel.sh | 2 +- 9 files changed, 9 insertions(+), 38 deletions(-) diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 629b4d0fd..c438ebbb8 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -99,6 +99,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 945766078..9dcae3bbe 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -61,6 +61,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git 
a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index d76dca639..fe306bc26 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -88,6 +88,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 4c6f5564a..afa05076f 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -117,6 +117,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: wheel - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/build_docs.sh b/ci/build_docs.sh index ed1d087cf..b4a791446 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -29,18 +29,7 @@ set +u conda activate docs set -u -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda pip install nvidia-sphinx-theme diff --git a/ci/test_conda.sh b/ci/test_conda.sh index cf51a7522..47fb3d01f 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -70,7 +70,7 @@ else exit 1 fi -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index 835d33b38..5faaddedb 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -44,18 +44,7 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index a17618939..c11e2f5b7 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -31,18 +31,7 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index bdc6b50b9..a6c474fde 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -7,7 +7,7 @@ set -euo pipefail CUDA_VER_MAJOR_MINOR=${CUDA_VER%.*} 
rapids-logger "Install wheel with test dependencies" -package=$(realpath wheel*/numba_cuda*.whl) +package=$(realpath wheel/numba_cuda*.whl) echo "Package path: ${package}" DEPENDENCIES=( From 42654425a885eeb6c410f28df185ba3678b8b4b3 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:05:32 +0100 Subject: [PATCH 38/62] Remove Python 3.9 from matrix --- ci/matrix.yml | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 7ab320d2f..f810738c1 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -5,12 +5,10 @@ # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES] # build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } @@ -20,17 +18,17 @@ simulator-matrix: # We test "oldest" dependencies with the oldest supported Python version and # the second-newest Python version. test-matrix: - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'arm64', 
PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } From 112f3c9ab96374336c45e3f6808e4069207d41d2 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:07:27 +0100 Subject: [PATCH 39/62] Fix up pre-commit violations --- numba_cuda/numba/cuda/cext/_devicearray.cpp | 3 +++ numba_cuda/numba/cuda/cext/_devicearray.h | 3 +++ numba_cuda/numba/cuda/cext/_dispatcher.cpp | 3 +++ numba_cuda/numba/cuda/cext/_hashtable.cpp | 3 +++ numba_cuda/numba/cuda/cext/_hashtable.h | 3 +++ numba_cuda/numba/cuda/cext/_pymodule.h | 3 +++ numba_cuda/numba/cuda/cext/_typeof.cpp | 3 +++ numba_cuda/numba/cuda/cext/_typeof.h | 3 +++ numba_cuda/numba/cuda/cext/typeconv.cpp | 3 +++ 9 files changed, 27 insertions(+) diff --git a/numba_cuda/numba/cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp index cdfc73595..60e11a69f 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.cpp +++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + /* This file contains the base class implementation for all device arrays. The * base class is implemented in C so that computing typecodes for device arrays * can be implemented efficiently. */ diff --git a/numba_cuda/numba/cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h index ab469553a..e16726983 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.h +++ b/numba_cuda/numba/cuda/cext/_devicearray.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_DEVICEARRAY_H_ #define NUMBA_DEVICEARRAY_H_ diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index fc3757a10..4f5d20b6c 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause + #include "_pymodule.h" #include diff --git a/numba_cuda/numba/cuda/cext/_hashtable.cpp b/numba_cuda/numba/cuda/cext/_hashtable.cpp index d926256d3..7d5dda962 100644 --- a/numba_cuda/numba/cuda/cext/_hashtable.cpp +++ b/numba_cuda/numba/cuda/cext/_hashtable.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + /* * This file and _hashtable.h are from CPython 3.5. The symbols have been * renamed from _Py_hashxxx to _Numba_hashxxx to avoid name clashes with diff --git a/numba_cuda/numba/cuda/cext/_hashtable.h b/numba_cuda/numba/cuda/cext/_hashtable.h index fbc6d6013..9fb719472 100644 --- a/numba_cuda/numba/cuda/cext/_hashtable.h +++ b/numba_cuda/numba/cuda/cext/_hashtable.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + /* * See _hashtable.c for more information about this file. */ diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h index 47eedcb8a..cff21ebe7 100644 --- a/numba_cuda/numba/cuda/cext/_pymodule.h +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_PY_MODULE_H_ #define NUMBA_PY_MODULE_H_ diff --git a/numba_cuda/numba/cuda/cext/_typeof.cpp b/numba_cuda/numba/cuda/cext/_typeof.cpp index 4f03d0070..6730b6d6f 100644 --- a/numba_cuda/numba/cuda/cext/_typeof.cpp +++ b/numba_cuda/numba/cuda/cext/_typeof.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #include "_pymodule.h" #include diff --git a/numba_cuda/numba/cuda/cext/_typeof.h b/numba_cuda/numba/cuda/cext/_typeof.h index 6e0039b5f..54e39ff80 100644 --- a/numba_cuda/numba/cuda/cext/_typeof.h +++ b/numba_cuda/numba/cuda/cext/_typeof.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_TYPEOF_H_ #define NUMBA_TYPEOF_H_ diff --git a/numba_cuda/numba/cuda/cext/typeconv.cpp b/numba_cuda/numba/cuda/cext/typeconv.cpp index 3c51fdfa7..5af7d16b3 100644 --- a/numba_cuda/numba/cuda/cext/typeconv.cpp +++ b/numba_cuda/numba/cuda/cext/typeconv.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #include #include #include From 0cf1ca8ca8110ddea538336ba49b7859219a2d2f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:08:55 +0100 Subject: [PATCH 40/62] Fix up pre-commit violations --- numba_cuda/numba/cuda/cext/mviewbuf.c | 3 +++ numba_cuda/numba/cuda/cext/typeconv.hpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/numba_cuda/numba/cuda/cext/mviewbuf.c b/numba_cuda/numba/cuda/cext/mviewbuf.c index 33f0888c7..eb5075d0b 100644 --- a/numba_cuda/numba/cuda/cext/mviewbuf.c +++ b/numba_cuda/numba/cuda/cext/mviewbuf.c @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause + #include "_pymodule.h" static int get_writable_buffer(PyObject* obj, Py_buffer *buf, int force) diff --git a/numba_cuda/numba/cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp index 1f3cb9359..da5d87a77 100644 --- a/numba_cuda/numba/cuda/cext/typeconv.hpp +++ b/numba_cuda/numba/cuda/cext/typeconv.hpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_TYPECONV_HPP_ #define NUMBA_TYPECONV_HPP_ #include From e7c5c9179b6ce9ad4f82b0e4972c61a9b8921230 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:20:04 +0100 Subject: [PATCH 41/62] Attempt to fix docs build repo --- .github/workflows/docs-build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 9dcae3bbe..83c78fe8d 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -49,7 +49,7 @@ jobs: env: RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts container: - image: rapidsai/ci-conda:latest + image: rapidsai/ci-conda:cuda13.0.0-ubuntu24.04-py3.13 env: RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} steps: @@ -60,7 +60,7 @@ jobs: fetch-depth: 0 - uses: actions/download-artifact@v4 with: - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + name: conda-repo-py3.13-amd64 path: conda-repo - name: Display structure of downloaded files run: ls -R From f3948a079c44d13a5e99e7e2bad9d2e203bbc679 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:02:53 +0100 Subject: [PATCH 42/62] Fix matrix for coverage report --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index da52b2d39..92b67c5dc 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -169,4 +169,4 @@ jobs: build_type: pull-request script: "ci/coverage_report.sh" matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.11")) | .[0:1]' + matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.12")) | .[0:1]' From 199439796f96d359761d9b0ffccd67b6338debc4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:38:11 +0100 Subject: [PATCH 43/62] Build wheel on Windows --- .github/workflows/pr.yaml | 5 +- .github/workflows/wheel-windows-build.yaml | 58 ++++++++++++++++++++++ .github/workflows/wheel-windows-tests.yaml | 3 +- ci/build_wheel.ps1 | 33 ++++++++++++ 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/wheel-windows-build.yaml create mode 100644 ci/build_wheel.ps1 diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 92b67c5dc..701725134 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -22,6 +22,7 @@ jobs: - test-conda-ctypes-binding - test-simulator - build-wheels + - build-wheels-windows - test-wheels-windows - test-wheels - test-wheels-ctypes-binding @@ -108,9 +109,11 @@ jobs: build_type: pull-request script: "ci/build_wheel.sh" matrix: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + build-wheels-windows: + uses: ./.github/workflows/wheel-windows-build.yaml test-wheels-windows: needs: - - build-wheels + - build-wheels-windows - compute-matrix uses: ./.github/workflows/wheel-windows-tests.yaml 
test-wheels: diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml new file mode 100644 index 000000000..9bbc7cd2e --- /dev/null +++ b/.github/workflows/wheel-windows-build.yaml @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +name: "CI: Build wheel on Windows" + +on: + workflow_call: + inputs: + sha: + type: string + repo: + type: string + script: + type: string + default: "./ci/test_wheel.ps1" + +jobs: + compute-matrix: + runs-on: ubuntu-latest + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Python Test Matrix + id: compute-matrix + run: | + set -eo pipefail + export TEST_MATRIX="{ ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0' }" + MATRIX=$(yq -n -o json 'env(TEST_MATRIX)' | jq -c) + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + build: + name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, windows + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON('{"ARCH":["amd64"],"PY_VER":["3.12"],"CUDA_VER":["12.8.0"]}') }} + runs-on: "cuda-python-windows-gpu-github" + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - name: Set up Python ${{ matrix.PY_VER }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.PY_VER }} + + - name: Python build + run: | + ${{ inputs.script }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + - name: Upload wheel + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }}-windows + path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index dd7ce65cf..fa53ffc5b 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -50,7 +50,8 @@ jobs: - uses: actions/download-artifact@v4 with: - name: wheel + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }}-windows + path: wheel - name: Display structure of downloaded files run: Get-ChildItem -Recurse diff --git a/ci/build_wheel.ps1 b/ci/build_wheel.ps1 new file mode 100644 index 000000000..24e27aa62 --- /dev/null +++ b/ci/build_wheel.ps1 @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +$ErrorActionPreference = 'Stop' +Set-StrictMode -Version Latest + +function rapids-logger { + param ( + [Parameter(Mandatory=$true)] + [string]$Text + ) + + # Determine padding and box width + $padding = 2 + $boxWidth = $Text.Length + ($padding * 2) + $topBottom = '+' + ('-' * $boxWidth) + '+' + $middle = '|' + (' ' * $padding) + $Text + (' ' * $padding) + '|' + + # Print the box in green + Write-Host $topBottom -ForegroundColor Green + Write-Host $middle -ForegroundColor Green + Write-Host $topBottom -ForegroundColor Green +} + +rapids-logger "Install build package" +python -m pip install build + +rapids-logger "Build sdist and wheel" +python -m build . 
+ +$wheel_path = Resolve-Path dist\numba_cuda*.whl | Select-Object -ExpandProperty Path +echo "Wheel path: $wheel_path" +echo "wheel_path=$wheel_path" >> $GITHUB_ENV From faed1784aed5ed6d3eb816df08cf474d1cdfa2cb Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:44:45 +0100 Subject: [PATCH 44/62] Use correct script for wheel Windows build --- .github/workflows/wheel-windows-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 9bbc7cd2e..53c00a1ca 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -12,7 +12,7 @@ on: type: string script: type: string - default: "./ci/test_wheel.ps1" + default: "./ci/build_wheel.ps1" jobs: compute-matrix: From 9d0ca3a90b22d282bb13a7039b936a9a8073b2fa Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:51:20 +0100 Subject: [PATCH 45/62] Add MSVC to Windows build --- .github/workflows/wheel-windows-build.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 53c00a1ca..2c724bea2 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -40,16 +40,21 @@ jobs: repository: ${{ inputs.repo }} ref: ${{ inputs.sha }} fetch-depth: 0 + - name: Set up Python ${{ matrix.PY_VER }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.PY_VER }} + - name: Set up MSVC + uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits + - name: Python build run: | ${{ inputs.script }} env: CUDA_VER: ${{ matrix.CUDA_VER }} + - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 From d9cbd9ec3f8b2aa021dfbd1138006bc1c29dbc09 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 15:01:30 +0100 Subject: [PATCH 46/62] Use a different runner for Windows build --- .github/workflows/wheel-windows-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 2c724bea2..fbf52a634 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -33,7 +33,7 @@ jobs: strategy: fail-fast: false matrix: ${{ fromJSON('{"ARCH":["amd64"],"PY_VER":["3.12"],"CUDA_VER":["12.8.0"]}') }} - runs-on: "cuda-python-windows-gpu-github" + runs-on: windows-2022 steps: - uses: actions/checkout@v4 with: From 770813c41fa2c8ef27d4c7532b0f097419a5be84 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 17:10:07 +0100 Subject: [PATCH 47/62] Try using env var for GITHUB_ENV in Windows wheel build --- ci/build_wheel.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_wheel.ps1 b/ci/build_wheel.ps1 index 24e27aa62..77608ac54 100644 --- a/ci/build_wheel.ps1 +++ b/ci/build_wheel.ps1 @@ -30,4 +30,4 @@ python -m build . 
$wheel_path = Resolve-Path dist\numba_cuda*.whl | Select-Object -ExpandProperty Path echo "Wheel path: $wheel_path" -echo "wheel_path=$wheel_path" >> $GITHUB_ENV +echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV From 180c9a45e13a384ce5f41b4c3127d0544d55003a Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 17:36:10 +0100 Subject: [PATCH 48/62] Attempt to have Python matrix for Windows build --- .github/workflows/pr.yaml | 10 ++++++++++ .github/workflows/wheel-windows-build.yaml | 10 ++++------ .github/workflows/wheel-windows-tests.yaml | 9 ++++----- ci/matrix.yml | 5 +++++ 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 701725134..4e8182be6 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -49,12 +49,16 @@ jobs: id: compute-matrix run: | BUILD_MATRIX="$(yq '.build-matrix' ci/matrix.yml)" + WINDOWS_MATRIX="$(yq '.windows-matrix' ci/matrix.yml)" SIMULATOR_MATRIX="$(yq '.simulator-matrix' ci/matrix.yml)" TEST_MATRIX="$(yq '.test-matrix' ci/matrix.yml)" { echo 'BUILD_MATRIX< Date: Wed, 3 Sep 2025 20:16:26 +0100 Subject: [PATCH 49/62] Add missing output in compute-matrix --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4e8182be6..1c6df6a3d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -41,6 +41,7 @@ jobs: runs-on: ubuntu-latest outputs: BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} + WINDOWS_MATRIX: ${{ steps.compute-matrix.outputs.WINDOWS_MATRIX }} SIMULATOR_MATRIX: ${{ steps.compute-matrix.outputs.SIMULATOR_MATRIX }} TEST_MATRIX: ${{ steps.compute-matrix.outputs.TEST_MATRIX }} steps: From bef9816c67b70b9252530d7af85f9012d5137fda Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 20:32:57 +0100 Subject: [PATCH 50/62] Another attempt to fix things --- .github/workflows/wheel-windows-build.yaml | 10 ++++++++++ .github/workflows/wheel-windows-tests.yaml | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 22ad66402..16c6027a5 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -14,6 +14,15 @@ on: type: string default: "./ci/build_wheel.ps1" + # general settings + matrix: + type: string + required: true + matrix_filter: + type: string + default: "." + + jobs: compute-matrix: runs-on: ubuntu-latest @@ -30,6 +39,7 @@ jobs: name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows needs: compute-matrix strategy: + fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} runs-on: windows-2022 steps: diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 577a3f63a..e7e627007 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -14,6 +14,14 @@ on: type: string default: "./ci/test_wheel.ps1" + # general settings + matrix: + type: string + required: true + matrix_filter: + type: string + default: "." 
+ jobs: compute-matrix: runs-on: ubuntu-latest From 145767c051a1b5380a5f55f908ad1c2d0db996a4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 20:44:29 +0100 Subject: [PATCH 51/62] Another attempt at matrix fix --- .github/workflows/wheel-windows-build.yaml | 3 ++- .github/workflows/wheel-windows-tests.yaml | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 16c6027a5..ce6199328 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -22,7 +22,6 @@ on: type: string default: "." - jobs: compute-matrix: runs-on: ubuntu-latest @@ -31,6 +30,8 @@ jobs: steps: - name: Compute Python Test Matrix id: compute-matrix + env: + MATRIX: ${{ inputs.matrix }} run: | set -eo pipefail MATRIX=$(yq -n -o json 'env(MATRIX)' | jq -c) diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index e7e627007..89cdace2c 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -30,6 +30,8 @@ jobs: steps: - name: Compute Python Test Matrix id: compute-matrix + env: + MATRIX: ${{ inputs.matrix }} run: | set -eo pipefail MATRIX=$(yq -n -o json 'env(MATRIX)' | jq -c) From 3ecc2ac48e990e506dac89d95e8c1e6a92a79b74 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:17:52 +0100 Subject: [PATCH 52/62] Fix potential issue in yaml files --- .github/workflows/wheel-windows-build.yaml | 3 ++- .github/workflows/wheel-windows-tests.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index ce6199328..49a60e8f0 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -38,7 +38,8 @@ jobs: echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" build: name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows - needs: compute-matrix + needs: + - compute-matrix strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 89cdace2c..ca37c1059 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -38,7 +38,8 @@ jobs: echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" tests: name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows - needs: compute-matrix + needs: + - compute-matrix strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} From b7eac0c026faed71c1e5cff8eb2f2f94932d39ba Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:37:40 +0100 Subject: [PATCH 53/62] Experiment if issue is with matrix --- .github/workflows/wheel-windows-build.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 49a60e8f0..2f031a7f2 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -28,7 +28,7 @@ jobs: outputs: MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} steps: - - name: Compute Python Test Matrix + - name: Compute Build Matrix id: compute-matrix env: MATRIX: ${{ inputs.matrix }} @@ -42,7 +42,11 @@ jobs: - compute-matrix strategy: fail-fast: false - matrix: ${{ 
fromJSON(needs.compute-matrix.outputs.MATRIX) }} + matrix: + - {"PY_VER": "3.10", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.11", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.12", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.13", "CUDA_VER": "12.8.0"} runs-on: windows-2022 steps: - uses: actions/checkout@v4 From af752941dfc9d95d9a1d6004ef4d47981a53181f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:44:46 +0100 Subject: [PATCH 54/62] Does this fix it? --- .github/workflows/wheel-windows-build.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 2f031a7f2..42ac8c702 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -37,16 +37,17 @@ jobs: MATRIX=$(yq -n -o json 'env(MATRIX)' | jq -c) echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" build: - name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows + name: ${{ matrix.config.CUDA_VER }}, ${{ matrix.config.PY_VER }}, windows needs: - compute-matrix strategy: fail-fast: false matrix: - - {"PY_VER": "3.10", "CUDA_VER": "12.8.0"} - - {"PY_VER": "3.11", "CUDA_VER": "12.8.0"} - - {"PY_VER": "3.12", "CUDA_VER": "12.8.0"} - - {"PY_VER": "3.13", "CUDA_VER": "12.8.0"} + config: + - {"PY_VER": "3.10", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.11", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.12", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.13", "CUDA_VER": "12.8.0"} runs-on: windows-2022 steps: - uses: actions/checkout@v4 @@ -55,10 +56,10 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - - name: Set up Python ${{ matrix.PY_VER }} + - name: Set up Python ${{ matrix.config.PY_VER }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: ${{ matrix.PY_VER }} + python-version: ${{ matrix.config.PY_VER }} - name: Set up MSVC uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits @@ -67,11 +68,11 @@ jobs: run: | ${{ inputs.script }} env: - CUDA_VER: ${{ matrix.CUDA_VER }} + CUDA_VER: ${{ matrix.config.CUDA_VER }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel-py${{ matrix.PY_VER }}-windows + name: wheel-py${{ matrix.config.PY_VER }}-windows path: ${{ env.wheel_path }} From 9157012b04534c126fe6f0c144a7ab4703fbc80b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:53:43 +0100 Subject: [PATCH 55/62] Attempt to simplify Windows matrix setup --- .github/workflows/pr.yaml | 12 ------ .github/workflows/wheel-windows-build.yaml | 43 +++++----------------- .github/workflows/wheel-windows-tests.yaml | 34 ++++------------- ci/matrix.yml | 5 --- 4 files changed, 18 insertions(+), 76 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1c6df6a3d..b2b5a2a6a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -41,7 +41,6 @@ jobs: runs-on: ubuntu-latest outputs: BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} - WINDOWS_MATRIX: ${{ steps.compute-matrix.outputs.WINDOWS_MATRIX }} SIMULATOR_MATRIX: ${{ steps.compute-matrix.outputs.SIMULATOR_MATRIX }} TEST_MATRIX: ${{ steps.compute-matrix.outputs.TEST_MATRIX }} steps: @@ -50,16 +49,12 @@ jobs: id: compute-matrix run: | BUILD_MATRIX="$(yq '.build-matrix' ci/matrix.yml)" - WINDOWS_MATRIX="$(yq '.windows-matrix' ci/matrix.yml)" SIMULATOR_MATRIX="$(yq '.simulator-matrix' ci/matrix.yml)" TEST_MATRIX="$(yq '.test-matrix' ci/matrix.yml)" { echo 
'BUILD_MATRIX< Date: Thu, 4 Sep 2025 12:16:54 +0100 Subject: [PATCH 56/62] Remove accidental duplicate install of numba-cuda in docs build --- ci/build_docs.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index b4a791446..f5739b556 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -29,8 +29,6 @@ set +u conda activate docs set -u -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - pip install nvidia-sphinx-theme rapids-print-env From 300d90c37fe77edd7d2193a5a9366abe768b17c8 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 12:19:57 +0100 Subject: [PATCH 57/62] Remove some duplicate installation steps --- ci/test_conda.sh | 16 ---------------- ci/test_conda_ctypes_binding.sh | 2 -- ci/test_simulator.sh | 2 -- 3 files changed, 20 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 47fb3d01f..ed2d57cef 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-2-Clause set -euo pipefail -set -x . /opt/conda/etc/profile.d/conda.sh @@ -59,19 +58,6 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" @@ -99,13 +85,11 @@ print(test_dir) " rapids-logger "Build tests" - export NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$GET_TEST_BINARY_DIR") pushd $NUMBA_CUDA_TEST_BIN_DIR make popd - rapids-logger "Run Tests" pytest --pyargs numba.cuda.tests -v diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index 5faaddedb..844b35b40 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -44,8 +44,6 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index c11e2f5b7..bb85a8733 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -31,8 +31,6 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" From d27dd77a9961ee710b39731b4b7eb5103a081fe4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 12:20:24 +0100 Subject: [PATCH 58/62] Remove unnecessary NumPy version pins --- conda/recipes/numba-cuda/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 8016ca6d3..b03309822 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,8 +25,7 @@ requirements: - python - pip - setuptools - - numpy >=2.1.0 # [py >= 310] - - numpy ==2.0.2 # [py == 39] + - numpy >=2.1.0 run: - python - numba >=0.59.1 From aa4ecc3bf8da2f0947119d9af33ef2427a9d5508 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 12:26:53 +0100 Subject: [PATCH 59/62] Remove debugging code --- numba_cuda/numba/cuda/testing.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/numba_cuda/numba/cuda/testing.py 
b/numba_cuda/numba/cuda/testing.py index f3700593c..373a66476 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -336,9 +336,3 @@ class ForeignArray(object): def __init__(self, arr): self._arr = arr self.__cuda_array_interface__ = arr.__cuda_array_interface__ - - -def test_binary_generation_dir(): - testing = Path(__file__) - binary_gen_dir = testing.parent / "tests" / "test_binary_generation" - return binary_gen_dir From bbe7b3cd51d0baf1143312cf301d76c6ef5882ac Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Fri, 5 Sep 2025 10:43:33 -0500 Subject: [PATCH 60/62] Minor changes to address review feedback Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- numba_cuda/numba/cuda/cext/_dispatcher.cpp | 5 +++++ numba_cuda/numba/cuda/cext/_pymodule.h | 2 +- numba_cuda/numba/cuda/dispatcher.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index 4f5d20b6c..bfd3c6518 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -43,6 +43,11 @@ #endif #undef _PyGC_FINALIZED +/* dynamic_annotations.h is needed for building Python with --with-valgrind + * support. The following include is to workaround issues described in + * https://github.com/numba/numba/pull/10073 + */ +#include "dynamic_annotations.h" #if (PY_MINOR_VERSION == 12) #include "internal/pycore_atomic.h" #endif diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h index cff21ebe7..4431dfff2 100644 --- a/numba_cuda/numba/cuda/cext/_pymodule.h +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -33,6 +33,6 @@ Py_DECREF(tmp); } while (0) -#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 9) || (PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) #endif /* NUMBA_PY_MODULE_H_ */ diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index be2674d67..ad3b57a85 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -1012,7 +1012,7 @@ def error_rewrite(e, issue_type): val = arg.value if isinstance(arg, OmittedArg) else arg try: tp = typeof(val, Purpose.argument) - except ValueError as typeof_exc: + except (errors.NumbaValueError, ValueError) as typeof_exc: failed_args.append((i, str(typeof_exc))) else: if tp is None: @@ -1306,7 +1306,7 @@ def typeof_pyval(self, val): """ try: tp = typeof(val, Purpose.argument) - except ValueError: + except (errors.NumbaValueError, ValueError): tp = types.pyobject else: if tp is None: From 8acf0dcdc50a5d98e0753cd447584a8927058d61 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Thu, 25 Sep 2025 14:43:21 -0700 Subject: [PATCH 61/62] Fix formatting in _dispatcher.cpp --- numba_cuda/numba/cuda/cext/_dispatcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index bfd3c6518..9b0b8f0dc 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -43,7 +43,7 @@ #endif #undef _PyGC_FINALIZED -/* dynamic_annotations.h is needed for building Python with --with-valgrind +/* dynamic_annotations.h is needed for building Python with 
--with-valgrind * support. The following include is to workaround issues described in * https://github.com/numba/numba/pull/10073 */ From 3db9ae85f090c791c924f24821ed1bf641fd096a Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 1 Oct 2025 20:58:31 -0700 Subject: [PATCH 62/62] Add warning about symbol conflicts if RTLD_GLOBAL is set for linux --- numba_cuda/numba/cuda/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 2c82b70d8..db14c30c9 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -5,6 +5,7 @@ from numba.cuda.core import config from .utils import _readenv import warnings +import sys # Enable pynvjitlink based on the following precedence: @@ -96,3 +97,13 @@ if numba_cuda_default_ptx_cc > config_default_cc: config.CUDA_DEFAULT_PTX_CC = numba_cuda_default_ptx_cc + + +# Warn if on Linux and RTLD_GLOBAL is enabled +if sys.platform.startswith("linux") and (sys.getdlopenflags() & 0x100) != 0: + warnings.warn( + "RTLD_GLOBAL is enabled, which might result in symbol resolution " + "conflicts when importing both numba and numba.cuda. Consider using " + "sys.setdlopenflags() to disable RTLD_GLOBAL " + "if you encounter symbol conflicts." + )
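
For reference, the flag test added in the final hunk above relies on RTLD_GLOBAL having the numeric value 0x100, which holds on Linux/glibc. The sketch below is illustrative only and is not part of the patch series: under that assumption it expresses the same check using os.RTLD_GLOBAL instead of the bare constant, and shows one way a caller could clear the flag before importing numba.cuda if the new warning fires.

# Illustrative sketch (assumption: Linux/glibc, where os.RTLD_GLOBAL == 0x100).
# Not part of the patches above; it mirrors the warning's check and shows one
# way a user could drop RTLD_GLOBAL before importing numba.cuda.
import os
import sys

if sys.platform.startswith("linux"):
    flags = sys.getdlopenflags()
    if flags & os.RTLD_GLOBAL:
        # Re-open subsequently loaded extension modules without RTLD_GLOBAL to
        # avoid symbol clashes between numba's and numba-cuda's C extensions.
        sys.setdlopenflags(flags & ~os.RTLD_GLOBAL)

import numba.cuda  # imported only after adjusting the dlopen flags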