From 1ae469cfcde904bf2c85c430f17db538b1fd192c Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Mon, 4 Aug 2025 08:55:21 -0700 Subject: [PATCH 01/62] [Refactor][NFC] Vendor in _DispatcherBase for CUDA-specific changes --- numba_cuda/numba/cuda/dispatcher.py | 608 +++++++++++++++++++++++++++- 1 file changed, 602 insertions(+), 6 deletions(-) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 9f69955b7..68706bfd1 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -8,16 +8,16 @@ import weakref import uuid -from numba.core import compiler, types, typing, config +from numba.core import compiler, types, typing, config, errors, entrypoints from numba.cuda import serialize, utils from numba.cuda.core.caching import Cache, CacheImpl, NullCache from numba.core.compiler_lock import global_compiler_lock -from numba.core.dispatcher import _DispatcherBase -from numba.core.errors import NumbaPerformanceWarning, TypingError +from numba.core.typeconv.rules import default_type_manager from numba.cuda.typing.templates import fold_arguments from numba.core.typing.typeof import Purpose, typeof from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg +from numba.core.bytecode import get_code_object from numba.cuda.compiler import ( compile_cuda, CUDACompiler, @@ -691,7 +691,7 @@ def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem): f"Grid size {grid_size} will likely result in GPU " "under-utilization due to low occupancy." ) - warn(NumbaPerformanceWarning(msg)) + warn(errors.NumbaPerformanceWarning(msg)) def __call__(self, *args): return self.dispatcher.call( @@ -734,6 +734,602 @@ def load_overload(self, sig, target_context): return super().load_overload(sig, target_context) +class OmittedArg(object): + """ + A placeholder for omitted arguments with a default value. + """ + + def __init__(self, value): + self.value = value + + def __repr__(self): + return "omitted arg(%r)" % (self.value,) + + @property + def _numba_type_(self): + return types.Omitted(self.value) + + +class CompilingCounter(object): + """ + A simple counter that increment in __enter__ and decrement in __exit__. + """ + + def __init__(self): + self.counter = 0 + + def __enter__(self): + assert self.counter >= 0 + self.counter += 1 + + def __exit__(self, *args, **kwargs): + self.counter -= 1 + assert self.counter >= 0 + + def __bool__(self): + return self.counter > 0 + + __nonzero__ = __bool__ + + +class _DispatcherBase(_dispatcher.Dispatcher): + """ + Common base class for dispatcher Implementations. 
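+
+ Subclasses keep the Python-side dispatch state: the ``overloads`` mapping
+ of compiled signatures to compile results, argument typing via
+ ``typeof_pyval``, and the ``_compile_for_args`` hook used when a call
+ arrives for which no specialization exists yet. Fast overload selection
+ itself is delegated to the C-level ``_dispatcher.Dispatcher`` base.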
+ """ + + __numba__ = "py_func" + + def __init__( + self, arg_count, py_func, pysig, can_fallback, exact_match_required + ): + self._tm = default_type_manager + + # A mapping of signatures to compile results + self.overloads = collections.OrderedDict() + + self.py_func = py_func + # other parts of Numba assume the old Python 2 name for code object + self.func_code = get_code_object(py_func) + # but newer python uses a different name + self.__code__ = self.func_code + # a place to keep an active reference to the types of the active call + self._types_active_call = set() + # Default argument values match the py_func + self.__defaults__ = py_func.__defaults__ + + argnames = tuple(pysig.parameters) + default_values = self.py_func.__defaults__ or () + defargs = tuple(OmittedArg(val) for val in default_values) + try: + lastarg = list(pysig.parameters.values())[-1] + except IndexError: + has_stararg = False + else: + has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL + _dispatcher.Dispatcher.__init__( + self, + self._tm.get_pointer(), + arg_count, + self._fold_args, + argnames, + defargs, + can_fallback, + has_stararg, + exact_match_required, + ) + + self.doc = py_func.__doc__ + self._compiling_counter = CompilingCounter() + self._enable_sysmon = bool(config.ENABLE_SYS_MONITORING) + weakref.finalize(self, self._make_finalizer()) + + def _compilation_chain_init_hook(self): + """ + This will be called ahead of any part of compilation taking place (this + even includes being ahead of working out the types of the arguments). + This permits activities such as initialising extension entry points so + that the compiler knows about additional externally defined types etc + before it does anything. + """ + entrypoints.init_all() + + def _reset_overloads(self): + self._clear() + self.overloads.clear() + + def _make_finalizer(self): + """ + Return a finalizer function that will release references to + related compiled functions. + """ + overloads = self.overloads + targetctx = self.targetctx + + # Early-bind utils.shutting_down() into the function's local namespace + # (see issue #689) + def finalizer(shutting_down=utils.shutting_down): + # The finalizer may crash at shutdown, skip it (resources + # will be cleared by the process exiting, anyway). + if shutting_down(): + return + # This function must *not* hold any reference to self: + # we take care to bind the necessary objects in the closure. + for cres in overloads.values(): + try: + targetctx.remove_user_function(cres.entry_point) + except KeyError: + pass + + return finalizer + + @property + def signatures(self): + """ + Returns a list of compiled function signatures. + """ + return list(self.overloads) + + @property + def nopython_signatures(self): + return [ + cres.signature + for cres in self.overloads.values() + if not cres.objectmode + ] + + def disable_compile(self, val=True): + """Disable the compilation of new signatures at call time.""" + # If disabling compilation then there must be at least one signature + assert (not val) or len(self.signatures) > 0 + self._can_compile = not val + + def add_overload(self, cres): + args = tuple(cres.signature.args) + sig = [a._code for a in args] + self._insert(sig, cres.entry_point, cres.objectmode) + self.overloads[args] = cres + + def fold_argument_types(self, args, kws): + return self._compiler.fold_argument_types(args, kws) + + def get_call_template(self, args, kws): + """ + Get a typing.ConcreteTemplate for this dispatcher and the given + *args* and *kws* types. 
This allows to resolve the return type. + + A (template, pysig, args, kws) tuple is returned. + """ + # XXX how about a dispatcher template class automating the + # following? + + # Fold keyword arguments and resolve default values + pysig, args = self._compiler.fold_argument_types(args, kws) + kws = {} + # Ensure an overload is available + if self._can_compile: + self.compile(tuple(args)) + + # Create function type for typing + func_name = self.py_func.__name__ + name = "CallTemplate({0})".format(func_name) + # The `key` isn't really used except for diagnosis here, + # so avoid keeping a reference to `cfunc`. + call_template = typing.make_concrete_template( + name, key=func_name, signatures=self.nopython_signatures + ) + return call_template, pysig, args, kws + + def get_overload(self, sig): + """ + Return the compiled function for the given signature. + """ + args, return_type = sigutils.normalize_signature(sig) + return self.overloads[tuple(args)].entry_point + + @property + def is_compiling(self): + """ + Whether a specialization is currently being compiled. + """ + return self._compiling_counter + + def _compile_for_args(self, *args, **kws): + """ + For internal use. Compile a specialized version of the function + for the given *args* and *kws*, and return the resulting callable. + """ + assert not kws + # call any initialisation required for the compilation chain (e.g. + # extension point registration). + self._compilation_chain_init_hook() + + def error_rewrite(e, issue_type): + """ + Rewrite and raise Exception `e` with help supplied based on the + specified issue_type. + """ + if config.SHOW_HELP: + help_msg = errors.error_extras[issue_type] + e.patch_message("\n".join((str(e).rstrip(), help_msg))) + if config.FULL_TRACEBACKS: + raise e + else: + raise e.with_traceback(None) + + argtypes = [] + for a in args: + if isinstance(a, OmittedArg): + argtypes.append(types.Omitted(a.value)) + else: + argtypes.append(self.typeof_pyval(a)) + + return_val = None + try: + return_val = self.compile(tuple(argtypes)) + except errors.ForceLiteralArg as e: + # Received request for compiler re-entry with the list of arguments + # indicated by e.requested_args. + # First, check if any of these args are already Literal-ized + already_lit_pos = [ + i + for i in e.requested_args + if isinstance(args[i], types.Literal) + ] + if already_lit_pos: + # Abort compilation if any argument is already a Literal. + # Letting this continue will cause infinite compilation loop. + m = ( + "Repeated literal typing request.\n" + "{}.\n" + "This is likely caused by an error in typing. " + "Please see nested and suppressed exceptions." + ) + info = ", ".join( + "Arg #{} is {}".format(i, args[i]) + for i in sorted(already_lit_pos) + ) + raise errors.CompilerError(m.format(info)) + # Convert requested arguments into a Literal. 
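+ # Only the positions named in e.requested_args are wrapped with
+ # types.literal; every other argument is passed through unchanged via the
+ # identity lambda, and the retry below recompiles with those arguments
+ # specialized to their literal values.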
+ args = [ + (types.literal if i in e.requested_args else lambda x: x)( + args[i] + ) + for i, v in enumerate(args) + ] + # Re-enter compilation with the Literal-ized arguments + return_val = self._compile_for_args(*args) + + except errors.TypingError as e: + # Intercept typing error that may be due to an argument + # that failed inferencing as a Numba type + failed_args = [] + for i, arg in enumerate(args): + val = arg.value if isinstance(arg, OmittedArg) else arg + try: + tp = typeof(val, Purpose.argument) + except ValueError as typeof_exc: + failed_args.append((i, str(typeof_exc))) + else: + if tp is None: + failed_args.append( + (i, f"cannot determine Numba type of value {val}") + ) + if failed_args: + # Patch error message to ease debugging + args_str = "\n".join( + f"- argument {i}: {err}" for i, err in failed_args + ) + msg = ( + f"{str(e).rstrip()} \n\nThis error may have been caused " + f"by the following argument(s):\n{args_str}\n" + ) + e.patch_message(msg) + + error_rewrite(e, "typing") + except errors.UnsupportedError as e: + # Something unsupported is present in the user code, add help info + error_rewrite(e, "unsupported_error") + except ( + errors.NotDefinedError, + errors.RedefinedError, + errors.VerificationError, + ) as e: + # These errors are probably from an issue with either the code + # supplied being syntactically or otherwise invalid + error_rewrite(e, "interpreter") + except errors.ConstantInferenceError as e: + # this is from trying to infer something as constant when it isn't + # or isn't supported as a constant + error_rewrite(e, "constant_inference") + except Exception as e: + if config.SHOW_HELP: + if hasattr(e, "patch_message"): + help_msg = errors.error_extras["reportable"] + e.patch_message("\n".join((str(e).rstrip(), help_msg))) + # ignore the FULL_TRACEBACKS config, this needs reporting! + raise e + finally: + self._types_active_call.clear() + return return_val + + def inspect_llvm(self, signature=None): + """Get the LLVM intermediate representation generated by compilation. + + Parameters + ---------- + signature : tuple of numba types, optional + Specify a signature for which to obtain the LLVM IR. If None, the + IR is returned for all available signatures. + + Returns + ------- + llvm : dict[signature, str] or str + Either the LLVM IR string for the specified signature, or, if no + signature was given, a dictionary mapping signatures to LLVM IR + strings. + """ + if signature is not None: + lib = self.overloads[signature].library + return lib.get_llvm_str() + + return dict((sig, self.inspect_llvm(sig)) for sig in self.signatures) + + def inspect_asm(self, signature=None): + """Get the generated assembly code. + + Parameters + ---------- + signature : tuple of numba types, optional + Specify a signature for which to obtain the assembly code. If + None, the assembly code is returned for all available signatures. + + Returns + ------- + asm : dict[signature, str] or str + Either the assembly code for the specified signature, or, if no + signature was given, a dictionary mapping signatures to assembly + code. + """ + if signature is not None: + lib = self.overloads[signature].library + return lib.get_asm_str() + + return dict((sig, self.inspect_asm(sig)) for sig in self.signatures) + + def inspect_types( + self, file=None, signature=None, pretty=False, style="default", **kwargs + ): + """Print/return Numba intermediate representation (IR)-annotated code. + + Parameters + ---------- + file : file-like object, optional + File to which to print. 
Defaults to sys.stdout if None. Must be + None if ``pretty=True``. + signature : tuple of numba types, optional + Print/return the intermediate representation for only the given + signature. If None, the IR is printed for all available signatures. + pretty : bool, optional + If True, an Annotate object will be returned that can render the + IR with color highlighting in Jupyter and IPython. ``file`` must + be None if ``pretty`` is True. Additionally, the ``pygments`` + library must be installed for ``pretty=True``. + style : str, optional + Choose a style for rendering. Ignored if ``pretty`` is ``False``. + This is directly consumed by ``pygments`` formatters. To see a + list of available styles, import ``pygments`` and run + ``list(pygments.styles.get_all_styles())``. + + Returns + ------- + annotated : Annotate object, optional + Only returned if ``pretty=True``, otherwise this function is only + used for its printing side effect. If ``pretty=True``, an Annotate + object is returned that can render itself in Jupyter and IPython. + """ + overloads = self.overloads + if signature is not None: + overloads = {signature: self.overloads[signature]} + + if not pretty: + if file is None: + file = sys.stdout + + for ver, res in overloads.items(): + print("%s %s" % (self.py_func.__name__, ver), file=file) + print("-" * 80, file=file) + print(res.type_annotation, file=file) + print("=" * 80, file=file) + else: + if file is not None: + raise ValueError("`file` must be None if `pretty=True`") + from numba.core.annotations.pretty_annotate import Annotate + + return Annotate(self, signature=signature, style=style) + + def inspect_cfg(self, signature=None, show_wrapper=None, **kwargs): + """ + For inspecting the CFG of the function. + + By default the CFG of the user function is shown. The *show_wrapper* + option can be set to "python" or "cfunc" to show the python wrapper + function or the *cfunc* wrapper function, respectively. + + Parameters accepted in kwargs + ----------------------------- + filename : string, optional + the name of the output file, if given this will write the output to + filename + view : bool, optional + whether to immediately view the optional output file + highlight : bool, set, dict, optional + what, if anything, to highlight, options are: + { incref : bool, # highlight NRT_incref calls + decref : bool, # highlight NRT_decref calls + returns : bool, # highlight exits which are normal returns + raises : bool, # highlight exits which are from raise + meminfo : bool, # highlight calls to NRT*meminfo + branches : bool, # highlight true/false branches + } + Default is True which sets all of the above to True. Supplying a set + of strings is also accepted, these are interpreted as key:True with + respect to the above dictionary. e.g. {'incref', 'decref'} would + switch on highlighting on increfs and decrefs. + interleave: bool, set, dict, optional + what, if anything, to interleave in the LLVM IR, options are: + { python: bool # interleave python source code with the LLVM IR + lineinfo: bool # interleave line information markers with the LLVM + # IR + } + Default is True which sets all of the above to True. Supplying a set + of strings is also accepted, these are interpreted as key:True with + respect to the above dictionary. e.g. {'python',} would + switch on interleaving of python source code in the LLVM IR. + strip_ir : bool, optional + Default is False. If set to True all LLVM IR that is superfluous to + that requested in kwarg `highlight` will be removed. 
+ show_key : bool, optional + Default is True. Create a "key" for the highlighting in the rendered + CFG. + fontsize : int, optional + Default is 8. Set the fontsize in the output to this value. + """ + if signature is not None: + cres = self.overloads[signature] + lib = cres.library + if show_wrapper == "python": + fname = cres.fndesc.llvm_cpython_wrapper_name + elif show_wrapper == "cfunc": + fname = cres.fndesc.llvm_cfunc_wrapper_name + else: + fname = cres.fndesc.mangled_name + return lib.get_function_cfg(fname, py_func=self.py_func, **kwargs) + + return dict( + (sig, self.inspect_cfg(sig, show_wrapper=show_wrapper)) + for sig in self.signatures + ) + + def inspect_disasm_cfg(self, signature=None): + """ + For inspecting the CFG of the disassembly of the function. + + Requires python package: r2pipe + Requires radare2 binary on $PATH. + Notebook rendering requires python package: graphviz + + signature : tuple of Numba types, optional + Print/return the disassembly CFG for only the given signatures. + If None, the IR is printed for all available signatures. + """ + if signature is not None: + cres = self.overloads[signature] + lib = cres.library + return lib.get_disasm_cfg(cres.fndesc.mangled_name) + + return dict( + (sig, self.inspect_disasm_cfg(sig)) for sig in self.signatures + ) + + def get_annotation_info(self, signature=None): + """ + Gets the annotation information for the function specified by + signature. If no signature is supplied a dictionary of signature to + annotation information is returned. + """ + signatures = self.signatures if signature is None else [signature] + out = collections.OrderedDict() + for sig in signatures: + cres = self.overloads[sig] + ta = cres.type_annotation + key = ( + ta.func_id.filename + ":" + str(ta.func_id.firstlineno + 1), + ta.signature, + ) + out[key] = ta.annotate_raw()[key] + return out + + def _explain_ambiguous(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + """ + assert not kws, "kwargs not handled" + args = tuple([self.typeof_pyval(a) for a in args]) + # The order here must be deterministic for testing purposes, which + # is ensured by the OrderedDict. + sigs = self.nopython_signatures + # This will raise + self.typingctx.resolve_overload( + self.py_func, sigs, args, kws, allow_ambiguous=False + ) + + def _explain_matching_error(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + msg = "No matching definition for argument type(s) %s" % ", ".join( + map(str, args) + ) + raise TypeError(msg) + + def _search_new_conversions(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + Search for approximately matching signatures for the given arguments, + and ensure the corresponding conversions are registered in the C++ + type manager. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + found = False + for sig in self.nopython_signatures: + conv = self.typingctx.install_possible_conversions(args, sig.args) + if conv: + found = True + return found + + def __repr__(self): + return "%s(%s)" % (type(self).__name__, self.py_func) + + def typeof_pyval(self, val): + """ + Resolve the Numba type of Python value *val*. + This is called from numba._dispatcher as a fallback if the native code + cannot decide the type. 
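+
+ For example (illustrative only, assuming a dispatcher instance ``disp``)::
+
+ disp.typeof_pyval(1.0)       # -> types.float64
+ disp.typeof_pyval(object())  # -> types.pyobject (fallback)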
+ """ + try: + tp = typeof(val, Purpose.argument) + except ValueError: + tp = types.pyobject + else: + if tp is None: + tp = types.pyobject + self._types_active_call.add(tp) + return tp + + def _callback_add_timer(self, duration, cres, lock_name): + md = cres.metadata + # md can be None when code is loaded from cache + if md is not None: + timers = md.setdefault("timers", {}) + if lock_name not in timers: + # Only write if the metadata does not exist + timers[lock_name] = duration + else: + msg = f"'{lock_name} metadata is already defined." + raise AssertionError(msg) + + def _callback_add_compiler_timer(self, duration, cres): + return self._callback_add_timer( + duration, cres, lock_name="compiler_lock" + ) + + def _callback_add_llvm_timer(self, duration, cres): + return self._callback_add_timer(duration, cres, lock_name="llvm_lock") + + class _MemoMixin: __uuid = None # A {uuid -> instance} mapping, for deserialization @@ -824,7 +1420,7 @@ def _compile_cached(self, args, return_type): try: retval = self._compile_core(args, return_type) - except TypingError as e: + except errors.TypingError as e: self._failed_cache[key] = e return False, e else: @@ -1312,7 +1908,7 @@ def get_compile_result(self, sig): self.compile(atypes) else: msg = f"{sig} not available and compilation disabled" - raise TypingError(msg) + raise errors.TypingError(msg) return self.overloads[atypes] def recompile(self): From bde712847bb9bf974779fda06f16a9bdc49f556f Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Mon, 4 Aug 2025 21:27:13 -0700 Subject: [PATCH 02/62] [Refactor][WIP] Vendoring in dispatcher cext for CUDA-specific changes --- numba_cuda/numba/cuda/__init__.py | 1 + .../numba/cuda/_dispatcher/_devicearray.h | 25 + .../numba/cuda/_dispatcher/_dispatcher.cpp | 1667 +++++++++++++++++ .../numba/cuda/_dispatcher/_hashtable.cpp | 529 ++++++ .../numba/cuda/_dispatcher/_hashtable.h | 132 ++ .../numba/cuda/_dispatcher/_numba_common.h | 43 + numba_cuda/numba/cuda/_dispatcher/_pymodule.h | 35 + numba_cuda/numba/cuda/_dispatcher/_typeof.cpp | 1170 ++++++++++++ numba_cuda/numba/cuda/_dispatcher/_typeof.h | 16 + .../numba/cuda/_dispatcher/typeconv.cpp | 209 +++ .../numba/cuda/_dispatcher/typeconv.hpp | 98 + numba_cuda/numba/cuda/dispatcher.py | 2 +- pyproject.toml | 1 + setup.py | 141 +- 14 files changed, 4063 insertions(+), 6 deletions(-) create mode 100644 numba_cuda/numba/cuda/_dispatcher/_devicearray.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/_hashtable.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_numba_common.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_pymodule.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/_typeof.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/_typeof.h create mode 100644 numba_cuda/numba/cuda/_dispatcher/typeconv.cpp create mode 100644 numba_cuda/numba/cuda/_dispatcher/typeconv.hpp diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 0465e1903..607ef2e09 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -1,5 +1,6 @@ import importlib from numba.core import config +from numba.core.config import ENABLE_SYS_MONITORING from .utils import _readenv import warnings diff --git a/numba_cuda/numba/cuda/_dispatcher/_devicearray.h b/numba_cuda/numba/cuda/_dispatcher/_devicearray.h new file mode 100644 index 000000000..5b276eacf --- /dev/null +++ 
b/numba_cuda/numba/cuda/_dispatcher/_devicearray.h @@ -0,0 +1,25 @@ +#ifndef NUMBA_DEVICEARRAY_H_ +#define NUMBA_DEVICEARRAY_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +/* These definitions should only be used by consumers of the Device Array API. + * Consumers access the API through the opaque pointer stored in + * _devicearray._DEVICEARRAY_API. We don't want these definitions in + * _devicearray.cpp itself because they would conflict with the actual + * implementations there. + */ +#ifndef NUMBA_IN_DEVICEARRAY_CPP_ + + extern void **DeviceArray_API; + #define DeviceArrayType (*(PyTypeObject*)DeviceArray_API[0]) + +#endif /* ndef NUMBA_IN_DEVICEARRAY_CPP */ + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_DEVICEARRAY_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp new file mode 100644 index 000000000..5afdb2880 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp @@ -0,0 +1,1667 @@ +#include "_pymodule.h" + +#include +#include +#include +#include + +#include "_typeof.h" +#include "frameobject.h" +#include "traceback.h" +#include "typeconv.hpp" +#include "_devicearray.h" + +/* + * Notes on the C_TRACE macro: + * + * The original C_TRACE macro (from ceval.c) would call + * PyTrace_C_CALL et al., for which the frame argument wouldn't + * be usable. Since we explicitly synthesize a frame using the + * original Python code object, we call PyTrace_CALL instead so + * the profiler can report the correct source location. + * + * Likewise, while ceval.c would call PyTrace_C_EXCEPTION in case + * of error, the profiler would simply expect a RETURN in case of + * a Python function, so we generate that here (making sure the + * exception state is preserved correctly). 
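+ *
+ * Note that on Python 3.12+ this C_TRACE machinery is not used at all:
+ * the call_cfunc for those versions reports the equivalent PY_START /
+ * PY_RETURN / RAISE / PY_UNWIND events through sys.monitoring instead
+ * (see invoke_monitoring further down).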
+ * + */ + +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +#ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 +#endif +#include "internal/pycore_frame.h" +// This is a fix suggested in the comments in https://github.com/python/cpython/issues/108216 +// specifically https://github.com/python/cpython/issues/108216#issuecomment-1696565797 +#ifdef HAVE_STD_ATOMIC +# undef HAVE_STD_ATOMIC +#endif +#undef _PyGC_FINALIZED + +#if (PY_MINOR_VERSION == 12) + #include "internal/pycore_atomic.h" +#endif +#include "internal/pycore_interp.h" +#include "internal/pycore_pyerrors.h" +#include "internal/pycore_instruments.h" +#include "internal/pycore_call.h" +#include "cpython/code.h" + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) +#ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 +#endif +#include "internal/pycore_frame.h" +#include "internal/pycore_pyerrors.h" + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L6804 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) { + return 0; + } + if (frame == NULL) { + return -1; + } + int old_what = tstate->tracing_what; + tstate->tracing_what = what; + PyThreadState_EnterTracing(tstate); + result = func(obj, frame, what, NULL); + PyThreadState_LeaveTracing(tstate); + tstate->tracing_what = old_what; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + _PyErr_Fetch(tstate, &type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, what, arg); + if (err == 0) + { + _PyErr_Restore(tstate, type, value, traceback); + return 0; + } + else { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L7245 + * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call, frame) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_CALL, cfunc)) { \ + x = NULL; \ +} \ +else { \ + x = call; \ + if (tstate->c_profilefunc != NULL) { \ + if (x == NULL) { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } else { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc)) { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} \ + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10 || PY_MINOR_VERSION == 11) + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L36-L40 + */ +typedef struct { + PyCodeObject *code; // The code object for the bounds. May be NULL. + PyCodeAddressRange bounds; // Only valid if code != NULL. 
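+ // Shadow CFrame whose use_tracing/previous fields are wired to the
+ // thread state's current cframe in the Python 3.10 call_cfunc below.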
+ CFrame cframe; +} PyTraceInfo; + + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1257-L1266 + * NOTE: The function is renamed. + */ +static void +_nb_PyLineTable_InitAddressRange(const char *linetable, Py_ssize_t length, int firstlineno, PyCodeAddressRange *range) +{ + range->opaque.lo_next = linetable; + range->opaque.limit = range->opaque.lo_next + length; + range->ar_start = -1; + range->ar_end = 0; + range->opaque.computed_line = firstlineno; + range->ar_line = -1; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1269-L1275 + * NOTE: The function is renamed. + */ +static int +_nb_PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds) +{ + const char *linetable = PyBytes_AS_STRING(co->co_linetable); + Py_ssize_t length = PyBytes_GET_SIZE(co->co_linetable); + _nb_PyLineTable_InitAddressRange(linetable, length, co->co_firstlineno, bounds); + return bounds->ar_line; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5468-L5475 + * NOTE: The call to _PyCode_InitAddressRange is renamed. + */ +static void +initialize_trace_info(PyTraceInfo *trace_info, PyFrameObject *frame) +{ + if (trace_info->code != frame->f_code) { + trace_info->code = frame->f_code; + _nb_PyCode_InitAddressRange(frame->f_code, &trace_info->bounds); + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5477-L5501 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->cframe->use_tracing = 0; + if (frame->f_lasti < 0) { + frame->f_lineno = frame->f_code->co_firstlineno; + } + else { + initialize_trace_info(trace_info, frame); + frame->f_lineno = _PyCode_CheckLineNumber(frame->f_lasti*sizeof(_Py_CODEUNIT), &trace_info->bounds); + } + result = func(obj, frame, what, arg); + frame->f_lineno = 0; + tstate->cframe->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5445-L5466 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, trace_info, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5810-L5839 + * NOTE: The state test https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5811 + * has been removed, it's dealt with in call_cfunc. 
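+ * NOTE: Unlike the 3.11-specific variant above, this macro forwards the
+ * locally-built PyTraceInfo (see call_cfunc) to call_trace; the 3.11
+ * variant instead receives the synthesized frame directly.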
+ */ +#define C_TRACE(x, call) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, tstate->frame, &trace_info, PyTrace_CALL,\ + cfunc)) \ + x = NULL; \ +else \ +{ \ + x = call; \ + if (tstate->c_profilefunc != NULL) \ + { \ + if (x == NULL) \ + { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + &trace_info, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } \ + else \ + { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + &trace_info, \ + PyTrace_RETURN, cfunc)) \ + { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} + +#else // Python <3.10 + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4242-L4257 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->use_tracing = 0; + result = func(obj, frame, what, arg); + tstate->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4520-L4549 + * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, tstate->frame, PyTrace_CALL, cfunc)) \ + x = NULL; \ +else \ +{ \ + x = call; \ + if (tstate->c_profilefunc != NULL) \ + { \ + if (x == NULL) \ + { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } \ + else \ + { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + PyTrace_RETURN, cfunc)) \ + { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} + + +#endif + +typedef std::vector TypeTable; +typedef std::vector Functions; + +/* The Dispatcher class is the base class of all dispatchers in the CPU and + CUDA targets. Its main responsibilities are: + + - Resolving the best overload to call for a given set of arguments, and + - Calling the resolved overload. + + This logic is implemented within this class for efficiency (lookup of the + appropriate overload needs to be fast) and ease of implementation (calling + directly into a compiled function using a function pointer is easier within + the C++ code where the overload has been resolved). 
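+
+ Overload storage is deliberately flat: `overloads` holds argct type codes
+ per overload, laid out back to back, with the matching callable at the
+ same index in `functions`. resolve() hands this flat array to the
+ TypeManager, which selects the best (or exact, if required) match for the
+ argument types of a call.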
*/ +class Dispatcher { +public: + PyObject_HEAD + /* Whether compilation of new overloads is permitted */ + char can_compile; + /* Enable sys.monitoring (since Python 3.12+) */ + char enable_sysmon; + /* Whether fallback to object mode is permitted */ + char can_fallback; + /* Whether types must match exactly when resolving overloads. + If not, conversions (e.g. float32 -> float64) are permitted when + searching for a match. */ + char exact_match_required; + /* Borrowed reference */ + PyObject *fallbackdef; + /* Whether to fold named arguments and default values + (false for lifted loops) */ + int fold_args; + /* Whether the last positional argument is a stararg */ + int has_stararg; + /* Tuple of argument names */ + PyObject *argnames; + /* Tuple of default values */ + PyObject *defargs; + /* Number of arguments to function */ + int argct; + /* Used for selecting overloaded function implementations */ + TypeManager *tm; + /* An array of overloads */ + Functions functions; + /* A flattened array of argument types to all overloads + * (invariant: sizeof(overloads) == argct * sizeof(functions)) */ + TypeTable overloads; + + /* Add a new overload. Parameters: + + - args: An array of Type objects, one for each parameter + - callable: The callable implementing this overload. */ + void addDefinition(Type args[], PyObject *callable) { + overloads.reserve(argct + overloads.size()); + for (int i=0; iselectOverload(sig, &overloads[0], selected, argct, + ovct, allow_unsafe, + exact_match_required); + } + if (matches == 1) { + return functions[selected]; + } + return NULL; + } + + /* Remove all overloads */ + void clear() { + functions.clear(); + overloads.clear(); + } + +}; + + +static int +Dispatcher_traverse(Dispatcher *self, visitproc visit, void *arg) +{ + Py_VISIT(self->defargs); + return 0; +} + +static void +Dispatcher_dealloc(Dispatcher *self) +{ + Py_XDECREF(self->argnames); + Py_XDECREF(self->defargs); + self->clear(); + Py_TYPE(self)->tp_free((PyObject*)self); +} + + +static int +Dispatcher_init(Dispatcher *self, PyObject *args, PyObject *kwds) +{ + PyObject *tmaddrobj; + void *tmaddr; + int argct; + int can_fallback; + int has_stararg = 0; + int exact_match_required = 0; + + if (!PyArg_ParseTuple(args, "OiiO!O!i|ii", &tmaddrobj, &argct, + &self->fold_args, + &PyTuple_Type, &self->argnames, + &PyTuple_Type, &self->defargs, + &can_fallback, + &has_stararg, + &exact_match_required + )) { + return -1; + } + Py_INCREF(self->argnames); + Py_INCREF(self->defargs); + tmaddr = PyLong_AsVoidPtr(tmaddrobj); + self->tm = static_cast(tmaddr); + self->argct = argct; + self->can_compile = 1; + self->enable_sysmon = 0; // default to turn off sys.monitoring + self->can_fallback = can_fallback; + self->fallbackdef = NULL; + self->has_stararg = has_stararg; + self->exact_match_required = exact_match_required; + return 0; +} + +static PyObject * +Dispatcher_clear(Dispatcher *self, PyObject *args) +{ + self->clear(); + Py_RETURN_NONE; +} + +static +PyObject* +Dispatcher_Insert(Dispatcher *self, PyObject *args, PyObject *kwds) +{ + /* The cuda kwarg is a temporary addition until CUDA overloads are compiled + * functions. Once they are compiled functions, kwargs can be removed from + * this function. 
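+ * When cuda is true, the PyCFunction type check below is skipped, because
+ * CUDA overloads are _Kernel instances rather than builtin compiled
+ * functions.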
*/ + static char *keywords[] = { + (char*)"sig", + (char*)"func", + (char*)"objectmode", + (char*)"cuda", + NULL + }; + + PyObject *sigtup, *cfunc; + int i, sigsz; + int *sig; + int objectmode = 0; + int cuda = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|ip", keywords, &sigtup, + &cfunc, &objectmode, &cuda)) { + return NULL; + } + + if (!cuda && !PyObject_TypeCheck(cfunc, &PyCFunction_Type) ) { + PyErr_SetString(PyExc_TypeError, "must be builtin_function_or_method"); + return NULL; + } + + sigsz = PySequence_Fast_GET_SIZE(sigtup); + sig = new int[sigsz]; + + for (i = 0; i < sigsz; ++i) { + sig[i] = PyLong_AsLong(PySequence_Fast_GET_ITEM(sigtup, i)); + } + + /* The reference to cfunc is borrowed; this only works because the + derived Python class also stores an (owned) reference to cfunc. */ + self->addDefinition(sig, cfunc); + + /* Add pure python fallback */ + if (!self->fallbackdef && objectmode){ + self->fallbackdef = cfunc; + } + + delete[] sig; + + Py_RETURN_NONE; +} + +static +void explain_issue(PyObject *dispatcher, PyObject *args, PyObject *kws, + const char *method_name, const char *default_msg) +{ + PyObject *callback, *result; + callback = PyObject_GetAttrString(dispatcher, method_name); + if (!callback) { + PyErr_SetString(PyExc_TypeError, default_msg); + return; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result != NULL) { + PyErr_Format(PyExc_RuntimeError, "%s must raise an exception", + method_name); + Py_DECREF(result); + } +} + +static +void explain_ambiguous(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_ambiguous", + "Ambiguous overloading"); +} + +static +void explain_matching_error(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_matching_error", + "No matching definition"); +} + +static +int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + PyObject *callback, *result; + int res; + + callback = PyObject_GetAttrString(dispatcher, + "_search_new_conversions"); + if (!callback) { + return -1; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result == NULL) { + return -1; + } + if (!PyBool_Check(result)) { + Py_DECREF(result); + PyErr_SetString(PyExc_TypeError, + "_search_new_conversions() should return a boolean"); + return -1; + } + res = (result == Py_True) ? 1 : 0; + Py_DECREF(result); + return res; +} + + +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11)) + +/* A custom, fast, inlinable version of PyCFunction_Call() */ +static PyObject * +call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) +{ + PyCFunctionWithKeywords fn; + PyThreadState *tstate; + + assert(PyCFunction_Check(cfunc)); + assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); + fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); + tstate = PyThreadState_GET(); + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) + /* + * On Python 3.11, _PyEval_EvalFrameDefault stops using PyTraceInfo since + * it's now baked into ThreadState. 
+ * https://github.com/python/cpython/pull/26623 + */ + if (tstate->cframe->use_tracing && tstate->c_profilefunc) +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10) + /* + * On Python 3.10+ trace_info comes from somewhere up in PyFrameEval et al, + * Numba doesn't have access to that so creates an equivalent struct and + * wires it up against the cframes. This is passed into the tracing + * functions. + * + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L1611-L1622 + */ + PyTraceInfo trace_info; + trace_info.code = NULL; // not initialized + CFrame *prev_cframe = tstate->cframe; + trace_info.cframe.use_tracing = prev_cframe->use_tracing; + trace_info.cframe.previous = prev_cframe; + + if (trace_info.cframe.use_tracing && tstate->c_profilefunc) +#else + /* + * On Python prior to 3.10, tracing state is a member of the threadstate + */ + if (tstate->use_tracing && tstate->c_profilefunc) +#endif + { + /* + * The following code requires some explaining: + * + * We want the jit-compiled function to be visible to the profiler, so we + * need to synthesize a frame for it. + * The PyFrame_New() constructor doesn't do anything with the 'locals' value if the 'code's + * 'CO_NEWLOCALS' flag is set (which is always the case nowadays). + * So, to get local variables into the frame, we have to manually set the 'f_locals' + * member, then call `PyFrame_LocalsToFast`, where a subsequent call to the `frame.f_locals` + * property (by virtue of the `frame_getlocals` function in frameobject.c) will find them. + */ + PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); + PyObject *globals = PyDict_New(); + PyObject *builtins = PyEval_GetBuiltins(); + PyFrameObject *frame = NULL; + PyObject *result = NULL; +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10)) + // Only used in 3.10, to help with saving/restoring exception state + PyObject *pyexc = NULL; + PyObject *err_type = NULL; + PyObject *err_value = NULL; + PyObject *err_traceback = NULL; +#endif + + if (!code) { + PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); + goto error; + } + /* Populate builtins, which is required by some JITted functions */ + if (PyDict_SetItemString(globals, "__builtins__", builtins)) { + goto error; + } + + /* unset the CO_OPTIMIZED flag, make the frame get a new locals dict */ + code->co_flags &= 0xFFFE; + + frame = PyFrame_New(tstate, code, globals, locals); + if (frame == NULL) { + goto error; + } +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) + // Python 3.11 improved the frame infrastructure such that frames are + // updated by the virtual machine, no need to do PyFrame_LocalsToFast + // and PyFrame_FastToLocals to ensure `frame->f_locals` is consistent. + C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws), frame); +#else + // Populate the 'fast locals' in `frame` + PyFrame_LocalsToFast(frame, 0); + tstate->frame = frame; + + // make the call + C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws)); + + // write changes back to locals? + // PyFrame_FastToLocals can clear the exception indicator, therefore + // this state needs saving and restoring across the call if the + // exception indicator is set. 
+ pyexc = PyErr_Occurred(); + if (pyexc != NULL) { + PyErr_Fetch(&err_type, &err_value, &err_traceback); + } + PyFrame_FastToLocals(frame); + if (pyexc != NULL) { + PyErr_Restore(err_type, err_value, err_traceback); + } + tstate->frame = frame->f_back; +#endif + error: + Py_XDECREF(frame); + Py_XDECREF(globals); + Py_XDECREF(code); + return result; + } + else + { + return fn(PyCFunction_GET_SELF(cfunc), args, kws); + } +} + +#elif (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +// Python 3.12 has a completely new approach to tracing and profiling due to +// the new `sys.monitoring` system. + +// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L863-L868 + +static const int8_t MOST_SIG_BIT[16] = {-1, 0, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, + 3, 3, 3, 3}; + +// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L873-L879 + +static inline int msb(uint8_t bits) { + if (bits > 15) { + return MOST_SIG_BIT[bits>>4]+4; + } + return MOST_SIG_BIT[bits]; +} + + +static int invoke_monitoring(PyThreadState * tstate, int event, Dispatcher *self, PyObject* retval) +{ + // This will invoke monitoring tools (if present) for the event `event`. + // + // Arguments: + // tstate - the interpreter thread state + // event - an event as defined in internal/pycore_instruments.h + // self - the dispatcher + // retval - the return value from running the dispatcher machine code (if needed) + // or NULL if not needed. + // + // Return: + // status 0 for success -1 otherwise. + // + // Notes: + // Python 3.12 has a new monitoring system as described in PEP 669. It's + // largely implemented in CPython PR #103083. + // + // This PEP manifests as a set of monitoring instrumentation in the form of + // per-monitoring-tool-type callbacks stored as part of the interpreter + // state (can also be on the code object for "local events" but Numba + // doesn't support those, see the Numba developer docs). From the Python + // interpreter this appears as `sys.monitoring`, from the C-side there's not + // a great deal of public API for the sort of things that Numba wants/needs + // to do. + // + // The new monitoring system is event based, the general idea in the + // following code is to see if a monitoring "tool" has registered a callback + // to run on the presence of a particular event and run those callbacks if + // so. In Numba's case we're just about to disappear into machine code + // that's essentially doing the same thing as the interpreter would if it + // executed the bytecode present in the function that's been JIT compiled. + // As a result we need to tell any tool that has a callback registered for a + // PY_MONITORING_EVENT_PY_START that a Python function is about to start + // (and do something similar for when a function returns/raises). + // This is a total lie as the execution is in machine code, but telling this + // lie makes it look like a python function has started executing at the + // point the machine code function starts and tools like profilers will be + // able to identify this and do something appropriate. The "lie" is very + // much like lie told for Python < 3.12, but the format of the lie is + // different. There is no fake frame involved, it's just about calling an + // appropriate call back, which in a way is a lot less confusing to deal + // with. 
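+ //
+ // As an illustrative Python-level sketch only (not code in this module,
+ // and "myprofiler" is a hypothetical tool name), the events faked here
+ // are the ones a profiling tool would subscribe to roughly like this:
+ //
+ //     import sys
+ //     TOOL = sys.monitoring.PROFILER_ID
+ //     sys.monitoring.use_tool_id(TOOL, "myprofiler")
+ //     sys.monitoring.register_callback(
+ //         TOOL, sys.monitoring.events.PY_START,
+ //         lambda code, instruction_offset: None)
+ //     sys.monitoring.register_callback(
+ //         TOOL, sys.monitoring.events.PY_RETURN,
+ //         lambda code, instruction_offset, retval: None)
+ //     sys.monitoring.set_events(
+ //         TOOL,
+ //         sys.monitoring.events.PY_START | sys.monitoring.events.PY_RETURN)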
+ // + // For reference, under cProfile all these are NULL, don't even look at + // them, they are legacy, you need to use the monitoring system! + // tstate->c_profilefunc + // tstate->c_profileobj + // tstate->c_tracefunc + // tstate->c_traceobj + // + // Finally: Useful places to look in the CPython code base: + // 1. internal/pycore_instruments.h which has the #defines for all the event + // types and the "types" of tools e.g. debugger, profiler. + // 2. Python/instrumentation.c which is where most of the implementation is + // done. Particularly functions `call_instrumentation_vector` and + // `call_one_instrument`. + // Note that Python/legacy_tracing.c is not somewhere to look, it's just + // wiring old style tracing that has been setup via e.g. C-API + // PyEval_SetProfile into the new monitoring system. + // + // Other things... + // 1. Calls to `sys.monitoring.set_events` clobber the previous state. + // 2. You can register callbacks for an event without having the event set. + // 3. You can set events and have no associated callback. + // 4. Tools are supposed to be respectful of other tools that are + // registered, i.e. not clobber/interfere with each other. + // 5. There are multiple slots for tools, cProfile is a profiler and + // profilers should register in slot 2 by convention. + // + // This is useful for debug: + // To detect whether Python is doing _any_ monitoring it's necessary to + // inspect the per-thread state interpreter monitors.tools member, its a + // uchar[15]. A non-zero value in any tools slot suggests something + // is registered to be called on the occurence of some event. + // + // bool monitoring_tools_present = false; + // for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { + // if (tstate->interp->monitors.tools[i]) { + // monitoring_tools_present = true; + // break; + // } + // } + + // The code in this function is based loosely on a combination of the + // following: + // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L945-L1008 + // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L1010-L1026 + // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L839-L861 + + // TODO: check this, call_instrumentation_vector has this at the top. + if (tstate->tracing){ + return 0; + } + + // Are there any tools set on this thead for this event? + uint8_t tools = tstate->interp->monitors.tools[event]; + // offset value for use in callbacks + PyObject * offset_obj = NULL; + // callback args slots (used in vectorcall protocol) + PyObject * callback_args[3] = {NULL, NULL, NULL}; + + // If so... + if (tools) + { + + + PyObject *result = NULL; + PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); // incref code + if (!code) { + PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); + return -1; + } + + // TODO: handle local events, see `get_tools_for_instruction`. + // The issue with local events is that they maybe don't make a lot of + // sense in a JIT context. The way it works is that + // `sys.monitoring.set_local_events` takes the code object of a function + // and "instruments" it with respect to the requested events. In + // practice this seems to materialise as swapping bytecodes associated + // with the event bitmask for `INSTRUMENTED_` variants of those + // bytecodes. 
Then at interpretation time if an instrumented instruction + // is encountered it triggers lookups in the `code->_co_monitoring` + // struct for tools and active monitors etc. In Numba we _know_ the + // bytecode at which the code starts and we can probably scrape the code + // to look for instrumented return instructions, so it is feasible to + // support at least PY_START and PY_RETURN events, however, it's a lot + // of effort for perhaps something that's practically not that useful. + // As a result, only global events are supported at present. + + // This is supposed to be the offset of the + // currently-being-interpreted bytecode instruction. In Numba's case + // there is no bytecode executing. We know that for a PY_START event + // that the offset is probably zero (it might be 2 if there's a + // closure, it's whereever the `RESUME` bytecode appears). However, + // we don't know which bytecode will be associated with the return + // (without huge effort to wire that through to here). Therefore + // zero is also used for return/raise/unwind, the main use case, + // cProfile, seems to manage to do something sensible even though this + // is inaccurate. + offset_obj = PyLong_FromSsize_t(0); // incref offset_obj + + // This is adapted from call_one_instrument. Note that Numba has to care + // about all events even though it only emits fake events for PY_START, + // PY_RETURN, RAISE and PY_UNWIND, this is because of the ability of + // `objmode` to call back into the interpreter and essentially create a + // continued Python execution environment/stack from there. + while(tools) { + // The tools registered are set as bits in `tools` and provide an + // index into monitoring_callables. This is presumably used by + // cPython to detect if the slot of a tool type is already in use so + // that a user can't register more than one tool of a given type at + // the same time. + int tool = msb(tools); + tools ^= (1 << tool); + // Get the instrument at offset `tool` for the event of interest, + // this is a callback function, it also might not be present! It + // is entirely legitimate to have events that have no callback + // and callbacks that have no event. This is to make it relatively + // easy to switch events on and off and ensure that monitoring is + // "lightweight". + PyObject * instrument = (PyObject *)tstate->interp->monitoring_callables[tool][event]; + if (instrument == NULL){ + continue; + } + + // Swap the threadstate "event" for the event of interest and + // increment the tracing tracking field (essentially, inlined + // PyThreadState_EnterTracing). + int old_what = tstate->what_event; + tstate->what_event = event; + tstate->tracing++; + + // Need to call the callback instrument. Need to know the number of + // arguments, this is based on whether the `retval` (return value) + // is NULL (it indicates whether this is a PY_START, or something + // like a PY_RETURN, which has 3 arguments). + size_t nargsf = (retval == NULL ? 2 : 3) | PY_VECTORCALL_ARGUMENTS_OFFSET; + + // call the instrumentation, look at the args to the callback + // functions for sys.monitoring events to find out what the + // arguments are. e.g. + // PY_START has `func(code: CodeType, instruction_offset: int)` + // whereas + // PY_RETURN has `func(code: CodeType, instruction_offset: int, retval: object)` + // and + // CALL, C_RAISE, C_RETURN has `func(code: CodeType, instruction_offset: int, callable: object, arg0 object|MISSING)` + // i.e. the signature changes based on context. 
This influences the + // value of `nargsf` and what is wired into `callback_args`. First two + // arguments are always code and offset, optional third arg is + // the return value. + callback_args[0] = (PyObject*)code; + callback_args[1] = (PyObject*)offset_obj; + callback_args[2] = (PyObject*)retval; + PyObject ** callargs = &callback_args[0]; + + // finally, stage the call the the instrument + result = PyObject_Vectorcall(instrument, callargs, nargsf, NULL); + + // decrement the tracing tracking field and set the event back to + // the original event (essentially, inlined + // PyThreadState_LeaveTracing). + tstate->tracing--; + tstate->what_event = old_what; + + if (result == NULL){ + // Error occurred in call to instrumentation. + Py_XDECREF(offset_obj); + Py_XDECREF(code); + return -1; + } + } + Py_XDECREF(offset_obj); + Py_XDECREF(code); + } + return 0; +} + +/* invoke monitoring for PY_START if it is set */ +int static inline invoke_monitoring_PY_START(PyThreadState * tstate, Dispatcher *self) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_START, self, NULL); +} + +/* invoke monitoring for PY_RETURN if it is set */ +int static inline invoke_monitoring_PY_RETURN(PyThreadState * tstate, Dispatcher *self, PyObject * retval) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_RETURN, self, retval); +} + +/* invoke monitoring for RAISE if it is set */ +int static inline invoke_monitoring_RAISE(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_RAISE, self, exception); +} + +/* invoke monitoring for PY_UNWIND if it is set */ +int static inline invoke_monitoring_PY_UNWIND(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { + return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_UNWIND, self, exception); +} + +/* forward declaration */ +bool static is_sysmon_enabled(Dispatcher *self); + +/* A custom, fast, inlinable version of PyCFunction_Call() */ +static PyObject * +call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) +{ + PyCFunctionWithKeywords fn = NULL; + PyThreadState *tstate = NULL; + PyObject * pyresult = NULL; + PyObject * pyexception = NULL; + const bool enabled_sysmon = is_sysmon_enabled(self); + + assert(PyCFunction_Check(cfunc)); + assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); + fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); + tstate = PyThreadState_GET(); + // issue PY_START if event is set + if(enabled_sysmon && invoke_monitoring_PY_START(tstate, self) != 0){ + return NULL; + } + // make call + pyresult = fn(PyCFunction_GET_SELF(cfunc), args, kws); + if (enabled_sysmon && pyresult == NULL) { + // pyresult == NULL, which means the Numba function raised an exception + // which is now pending. + // + // NOTE: that _ALL_ exceptions trigger the RAISE event, even a + // StopIteration exception. To get a STOP_ITERATION event, the + // StopIteration exception must be "implied" i.e. a for loop exhausting + // a generator, whereas those coming from the executing the binary + // wrapped in this dispatcher must always be explicit (this is after all + // a function dispatcher). + // + // NOTE: That it is necessary to trigger both a `RAISE` event, as this + // triggered by an exception being raised, and a `PY_UNWIND` event, as + // this is the event for "exiting from a python function during + // exception unwinding" (see CPython sys.monitoring docs). 
+ // + // In the following, if the call to PyErr_GetRaisedException returns + // NULL, it means that something has cleared the error indicator and + // this is a most surprising state to occur (shouldn't be possible!). + // + // TODO: This makes the exception raising path a little slower as the + // exception state is suspended and resumed regardless of whether + // monitoring for such an event is set. In future it might be worth + // checking the tstate->interp->monitors.tools[event] and only doing the + // suspend/resume if something is listening for the event. + pyexception = PyErr_GetRaisedException(); + if (pyexception != NULL) { + if(invoke_monitoring_RAISE(tstate, self, pyexception) != 0){ + // If the monitoring callback raised, return NULL so that the + // exception can propagate. + return NULL; + } + if(invoke_monitoring_PY_UNWIND(tstate, self, pyexception) != 0){ + // If the monitoring callback raised, return NULL so that the + // exception can propagate. + return NULL; + } + // reset the exception + PyErr_SetRaisedException(pyexception); + } + // Exception in Numba call as pyresult == NULL, start to unwind by + // returning NULL. + return NULL; + } + // issue PY_RETURN if event is set + if(enabled_sysmon && invoke_monitoring_PY_RETURN(tstate, self, pyresult) != 0){ + return NULL; + } + return pyresult; +} +#else +#error "Python version is not supported." +#endif + + +static +PyObject* +compile_and_invoke(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc, *retval; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + /* NOTE: we call the compiled function ourselves instead of + letting the Python derived class do it. This is for proper + behaviour of globals() in jitted functions (issue #476). */ + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + if (cfunc == NULL) + return NULL; + + if (PyObject_TypeCheck(cfunc, &PyCFunction_Type)) { + retval = call_cfunc(self, cfunc, args, kws, locals); + } else { + /* Re-enter interpreter */ + retval = PyObject_Call(cfunc, args, kws); + } + Py_DECREF(cfunc); + + return retval; +} + +/* A copy of compile_and_invoke, that only compiles. This is needed for CUDA + * kernels, because its overloads are Python instances of the _Kernel class, + * rather than compiled functions. Once CUDA overloads are compiled functions, + * cuda_compile_only can be removed. */ +static +PyObject* +cuda_compile_only(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + return cfunc; +} + +static int +find_named_args(Dispatcher *self, PyObject **pargs, PyObject **pkws) +{ + PyObject *oldargs = *pargs, *newargs; + PyObject *kws = *pkws; + Py_ssize_t pos_args = PyTuple_GET_SIZE(oldargs); + Py_ssize_t named_args, total_args, i; + Py_ssize_t func_args = PyTuple_GET_SIZE(self->argnames); + Py_ssize_t defaults = PyTuple_GET_SIZE(self->defargs); + /* Last parameter with a default value */ + Py_ssize_t last_def = (self->has_stararg) + ? 
func_args - 2 + : func_args - 1; + /* First parameter with a default value */ + Py_ssize_t first_def = last_def - defaults + 1; + /* Minimum number of required arguments */ + Py_ssize_t minargs = first_def; + + if (kws != NULL) + named_args = PyDict_Size(kws); + else + named_args = 0; + total_args = pos_args + named_args; + if (!self->has_stararg && total_args > func_args) { + PyErr_Format(PyExc_TypeError, + "too many arguments: expected %d, got %d", + (int) func_args, (int) total_args); + return -1; + } + else if (total_args < minargs) { + if (minargs == func_args) + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected %d, got %d", + (int) minargs, (int) total_args); + else + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected at least %d, got %d", + (int) minargs, (int) total_args); + return -1; + } + newargs = PyTuple_New(func_args); + if (!newargs) + return -1; + /* First pack the stararg */ + if (self->has_stararg) { + Py_ssize_t stararg_size = Py_MAX(0, pos_args - func_args + 1); + PyObject *stararg = PyTuple_New(stararg_size); + if (!stararg) { + Py_DECREF(newargs); + return -1; + } + for (i = 0; i < stararg_size; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, func_args - 1 + i); + Py_INCREF(value); + PyTuple_SET_ITEM(stararg, i, value); + } + /* Put it in last position */ + PyTuple_SET_ITEM(newargs, func_args - 1, stararg); + + } + for (i = 0; i < pos_args; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + } + + /* Iterate over missing positional arguments, try to find them in + named arguments or default values. */ + for (i = pos_args; i < func_args; i++) { + PyObject *name = PyTuple_GET_ITEM(self->argnames, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + if (kws != NULL) { + /* Named argument? */ + PyObject *value = PyDict_GetItem(kws, name); + if (value != NULL) { + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + named_args--; + continue; + } + } + if (i >= first_def && i <= last_def) { + /* Argument has a default value? */ + PyObject *value = PyTuple_GET_ITEM(self->defargs, i - first_def); + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + continue; + } + else if (i < func_args - 1 || !self->has_stararg) { + PyErr_Format(PyExc_TypeError, + "missing argument '%s'", + PyString_AsString(name)); + Py_DECREF(newargs); + return -1; + } + } + if (named_args) { + PyErr_Format(PyExc_TypeError, + "some keyword arguments unexpected"); + Py_DECREF(newargs); + return -1; + } + *pargs = newargs; + *pkws = NULL; + return 0; +} + +static PyObject* +Dispatcher_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 
1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + goto CLEANUP; + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. + + Note that the number of matches is returned in matches by resolve, which + accepts it as a reference. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager. + */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + if (matches == 1) { + /* Definition is found */ + retval = call_cfunc(self, cfunc, args, kws, locals); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = compile_and_invoke(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = compile_and_invoke(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +/* Based on Dispatcher_call above, with the following differences: + 1. It does not invoke the definition of the function. + 2. It returns the definition, instead of a value returned by the function. + + This is because CUDA functions are, at present, _Kernel objects rather than + compiled functions. */ +static PyObject* +Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 
1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + goto CLEANUP; + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager. + */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + + if (matches == 1) { + /* Definition is found */ + retval = cfunc; + Py_INCREF(retval); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = cuda_compile_only(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = cuda_compile_only(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +static int +import_devicearray(void) +{ + PyObject *devicearray = PyImport_ImportModule("numba._devicearray"); + if (devicearray == NULL) { + return -1; + } + Py_DECREF(devicearray); + + DeviceArray_API = (void**)PyCapsule_Import("numba._devicearray._DEVICEARRAY_API", 0); + if (DeviceArray_API == NULL) { + return -1; + } + + return 0; +} + +static PyMethodDef Dispatcher_methods[] = { + { "_clear", (PyCFunction)Dispatcher_clear, METH_NOARGS, NULL }, + { "_insert", (PyCFunction)Dispatcher_Insert, METH_VARARGS | METH_KEYWORDS, + "insert new definition"}, + { "_cuda_call", (PyCFunction)Dispatcher_cuda_call, + METH_VARARGS | METH_KEYWORDS, "CUDA call resolution" }, + { NULL }, +}; + +static PyMemberDef Dispatcher_members[] = { + {(char*)"_can_compile", T_BOOL, offsetof(Dispatcher, can_compile), 0, NULL }, + {(char*)"_enable_sysmon", T_BOOL, offsetof(Dispatcher, enable_sysmon), 0, NULL }, + {NULL} /* Sentinel */ +}; + + +static PyTypeObject DispatcherType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_dispatcher.Dispatcher", /* tp_name */ + sizeof(Dispatcher), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Dispatcher_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 
0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + (PyCFunctionWithKeywords)Dispatcher_call, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags*/ + "Dispatcher object", /* tp_doc */ + (traverseproc) Dispatcher_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Dispatcher_methods, /* tp_methods */ + Dispatcher_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Dispatcher_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + + +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +static +bool is_sysmon_enabled(Dispatcher * self) { + return self->enable_sysmon; +} +#endif + +static PyObject *compute_fingerprint(PyObject *self, PyObject *args) +{ + PyObject *val; + if (!PyArg_ParseTuple(args, "O:compute_fingerprint", &val)) + return NULL; + return typeof_compute_fingerprint(val); +} + +static PyMethodDef ext_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(typeof_init), + declmethod(compute_fingerprint), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(_dispatcher) { + if (import_devicearray() < 0) { + PyErr_Print(); + PyErr_SetString(PyExc_ImportError, "numba._devicearray failed to import"); + return MOD_ERROR_VAL; + } + + PyObject *m; + MOD_DEF(m, "_dispatcher", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + + DispatcherType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DispatcherType) < 0) { + return MOD_ERROR_VAL; + } + Py_INCREF(&DispatcherType); + PyModule_AddObject(m, "Dispatcher", (PyObject*)(&DispatcherType)); + + return MOD_SUCCESS_VAL(m); +} diff --git a/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp b/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp new file mode 100644 index 000000000..d926256d3 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp @@ -0,0 +1,529 @@ +/* + * This file and _hashtable.h are from CPython 3.5. The symbols have been + * renamed from _Py_hashxxx to _Numba_hashxxx to avoid name clashes with + * the CPython definitions (including at runtime through dynamic linking). + * Those CPython APIs are private and can change in incompatible ways at + * any time. 
+ * + * Command line used for renaming: + * $ sed -i -r 's/\b_Py_(has[h]table)/_Numba_\1/ig' numba/_hashtable.h numba/_hashtable.c + */ + +/* The implementation of the hash table (_Numba_hashtable_t) is based on the cfuhash + project: + http://sourceforge.net/projects/libcfu/ + + Copyright of cfuhash: + ---------------------------------- + Creation date: 2005-06-24 21:22:40 + Authors: Don + Change log: + + Copyright (c) 2005 Don Owens + All rights reserved. + + This code is released under the BSD license: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the author nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. 
+ ---------------------------------- +*/ + +#include "_pymodule.h" +#include "_hashtable.h" + +#define HASHTABLE_MIN_SIZE 16 +#define HASHTABLE_HIGH 0.50 +#define HASHTABLE_LOW 0.10 +#define HASHTABLE_REHASH_FACTOR 2.0 / (HASHTABLE_LOW + HASHTABLE_HIGH) + +#define BUCKETS_HEAD(SLIST) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(SLIST))) +#define TABLE_HEAD(HT, BUCKET) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(HT)->buckets[BUCKET])) +#define ENTRY_NEXT(ENTRY) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_ITEM_NEXT(ENTRY)) +#define HASHTABLE_ITEM_SIZE(HT) \ + (sizeof(_Numba_hashtable_entry_t) + (HT)->data_size) + +/* Forward declaration */ +static void hashtable_rehash(_Numba_hashtable_t *ht); + +static void +_Py_slist_init(_Py_slist_t *list) +{ + list->head = NULL; +} + +static void +_Py_slist_prepend(_Py_slist_t *list, _Py_slist_item_t *item) +{ + item->next = list->head; + list->head = item; +} + +static void +_Py_slist_remove(_Py_slist_t *list, _Py_slist_item_t *previous, + _Py_slist_item_t *item) +{ + if (previous != NULL) + previous->next = item->next; + else + list->head = item->next; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_int(const void *key) +{ + return (Py_uhash_t)key; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_ptr(const void *key) +{ + return (Py_uhash_t)_Py_HashPointer((void *)key); +} + +extern "C" int +_Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry) +{ + return entry->key == key; +} + +/* makes sure the real size of the buckets array is a power of 2 */ +static size_t +round_size(size_t s) +{ + size_t i; + if (s < HASHTABLE_MIN_SIZE) + return HASHTABLE_MIN_SIZE; + i = 1; + while (i < s) + i <<= 1; + return i; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new_full(size_t data_size, size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator) +{ + _Numba_hashtable_t *ht; + size_t buckets_size; + _Numba_hashtable_allocator_t alloc; + + if (allocator == NULL) { + alloc.malloc = PyMem_RawMalloc; + alloc.free = PyMem_RawFree; + } + else + alloc = *allocator; + + ht = (_Numba_hashtable_t *)alloc.malloc(sizeof(_Numba_hashtable_t)); + if (ht == NULL) + return ht; + + ht->num_buckets = round_size(init_size); + ht->entries = 0; + ht->data_size = data_size; + + buckets_size = ht->num_buckets * sizeof(ht->buckets[0]); + ht->buckets = (_Py_slist_t *) alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + alloc.free(ht); + return NULL; + } + memset(ht->buckets, 0, buckets_size); + + ht->hash_func = hash_func; + ht->compare_func = compare_func; + ht->copy_data_func = copy_data_func; + ht->free_data_func = free_data_func; + ht->get_data_size_func = get_data_size_func; + ht->alloc = alloc; + return ht; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new(size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func) +{ + return _Numba_hashtable_new_full(data_size, HASHTABLE_MIN_SIZE, + hash_func, compare_func, + NULL, NULL, NULL, NULL); +} + +extern "C" size_t +_Numba_hashtable_size(_Numba_hashtable_t *ht) +{ + size_t size; + size_t hv; + + size = sizeof(_Numba_hashtable_t); + + /* buckets */ + size += ht->num_buckets * sizeof(_Numba_hashtable_entry_t *); + + /* entries */ + size += ht->entries * HASHTABLE_ITEM_SIZE(ht); + + /* 
data linked from entries */ + if (ht->get_data_size_func) { + for (hv = 0; hv < ht->num_buckets; hv++) { + _Numba_hashtable_entry_t *entry; + + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + void *data; + + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + size += ht->get_data_size_func(data); + } + } + } + return size; +} + +#ifdef Py_DEBUG +extern "C" void +_Numba_hashtable_print_stats(_Numba_hashtable_t *ht) +{ + size_t size; + size_t chain_len, max_chain_len, total_chain_len, nchains; + _Numba_hashtable_entry_t *entry; + size_t hv; + double load; + + size = _Numba_hashtable_size(ht); + + load = (double)ht->entries / ht->num_buckets; + + max_chain_len = 0; + total_chain_len = 0; + nchains = 0; + for (hv = 0; hv < ht->num_buckets; hv++) { + entry = TABLE_HEAD(ht, hv); + if (entry != NULL) { + chain_len = 0; + for (; entry; entry = ENTRY_NEXT(entry)) { + chain_len++; + } + if (chain_len > max_chain_len) + max_chain_len = chain_len; + total_chain_len += chain_len; + nchains++; + } + } + printf("hash table %p: entries=%" + PY_FORMAT_SIZE_T "u/%" PY_FORMAT_SIZE_T "u (%.0f%%), ", + ht, ht->entries, ht->num_buckets, load * 100.0); + if (nchains) + printf("avg_chain_len=%.1f, ", (double)total_chain_len / nchains); + printf("max_chain_len=%" PY_FORMAT_SIZE_T "u, %" PY_FORMAT_SIZE_T "u kB\n", + max_chain_len, size / 1024); +} +#endif + +/* Get an entry. Return NULL if the key does not exist. */ +extern "C" _Numba_hashtable_entry_t * +_Numba_hashtable_get_entry(_Numba_hashtable_t *ht, const void *key) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + } + + return entry; +} + +static int +_hashtable_pop_entry(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry, *previous; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + previous = NULL; + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + previous = entry; + } + + if (entry == NULL) + return 0; + + _Py_slist_remove(&ht->buckets[index], (_Py_slist_item_t *)previous, + (_Py_slist_item_t *)entry); + ht->entries--; + + if (data != NULL) + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + ht->alloc.free(entry); + + if ((float)ht->entries / (float)ht->num_buckets < HASHTABLE_LOW) + hashtable_rehash(ht); + return 1; +} + +/* Add a new entry to the hash. The key must not be present in the hash table. + Return 0 on success, -1 on memory error. */ +extern "C" int +_Numba_hashtable_set(_Numba_hashtable_t *ht, const void *key, + void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + assert(data != NULL || data_size == 0); +#ifndef NDEBUG + /* Don't write the assertion on a single line because it is interesting + to know the duplicated entry if the assertion failed. The entry can + be read using a debugger. 
*/ + entry = _Numba_hashtable_get_entry(ht, key); + assert(entry == NULL); +#endif + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + entry = (_Numba_hashtable_entry_t *) ht->alloc.malloc(HASHTABLE_ITEM_SIZE(ht)); + if (entry == NULL) { + /* memory allocation failed */ + return -1; + } + + entry->key = (void *)key; + entry->key_hash = key_hash; + + assert(data_size == ht->data_size); + memcpy(_Numba_HASHTABLE_ENTRY_DATA(entry), data, data_size); + + _Py_slist_prepend(&ht->buckets[index], (_Py_slist_item_t*)entry); + ht->entries++; + + if ((float)ht->entries / (float)ht->num_buckets > HASHTABLE_HIGH) + hashtable_rehash(ht); + return 0; +} + +/* Get data from an entry. Copy entry data into data and return 1 if the entry + exists, return 0 if the entry does not exist. */ +extern "C" int +_Numba_hashtable_get(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + _Numba_hashtable_entry_t *entry; + + assert(data != NULL); + + entry = _Numba_hashtable_get_entry(ht, key); + if (entry == NULL) + return 0; + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + return 1; +} + +extern "C" int +_Numba_hashtable_pop(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + assert(data != NULL); + assert(ht->free_data_func == NULL); + return _hashtable_pop_entry(ht, key, data, data_size); +} + +/* Delete an entry. The entry must exist. */ +extern "C" void +_Numba_hashtable_delete(_Numba_hashtable_t *ht, const void *key) +{ +#ifndef NDEBUG + int found = _hashtable_pop_entry(ht, key, NULL, 0); + assert(found); +#else + (void)_hashtable_pop_entry(ht, key, NULL, 0); +#endif +} + +/* Prototype for a pointer to a function to be called foreach + key/value pair in the hash by hashtable_foreach(). Iteration + stops if a non-zero value is returned. 
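+
+   A minimal sketch of such a callback (illustrative only; count_entry and
+   n are hypothetical names, not defined in this file):
+
+     static int count_entry(_Numba_hashtable_entry_t *entry, void *arg)
+     {
+         size_t *n = (size_t *) arg;   // user-supplied counter
+         (void) entry;                 // the entry itself is not needed here
+         (*n)++;
+         return 0;                     // zero means: keep iterating
+     }
+
+   driven by a call such as _Numba_hashtable_foreach(ht, count_entry, &n).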
*/ +extern "C" int +_Numba_hashtable_foreach(_Numba_hashtable_t *ht, + int (*func) (_Numba_hashtable_entry_t *entry, void *arg), + void *arg) +{ + _Numba_hashtable_entry_t *entry; + size_t hv; + + for (hv = 0; hv < ht->num_buckets; hv++) { + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + int res = func(entry, arg); + if (res) + return res; + } + } + return 0; +} + +static void +hashtable_rehash(_Numba_hashtable_t *ht) +{ + size_t buckets_size, new_size, bucket; + _Py_slist_t *old_buckets = NULL; + size_t old_num_buckets; + + new_size = round_size((size_t)(ht->entries * HASHTABLE_REHASH_FACTOR)); + if (new_size == ht->num_buckets) + return; + + old_num_buckets = ht->num_buckets; + + buckets_size = new_size * sizeof(ht->buckets[0]); + old_buckets = ht->buckets; + ht->buckets = (_Py_slist_t *) ht->alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + /* cancel rehash on memory allocation failure */ + ht->buckets = old_buckets ; + /* memory allocation failed */ + return; + } + memset(ht->buckets, 0, buckets_size); + + ht->num_buckets = new_size; + + for (bucket = 0; bucket < old_num_buckets; bucket++) { + _Numba_hashtable_entry_t *entry, *next; + for (entry = BUCKETS_HEAD(old_buckets[bucket]); entry != NULL; entry = next) { + size_t entry_index; + + assert(ht->hash_func(entry->key) == entry->key_hash); + next = ENTRY_NEXT(entry); + entry_index = entry->key_hash & (new_size - 1); + + _Py_slist_prepend(&ht->buckets[entry_index], (_Py_slist_item_t*)entry); + } + } + + ht->alloc.free(old_buckets); +} + +extern "C" void +_Numba_hashtable_clear(_Numba_hashtable_t *ht) +{ + _Numba_hashtable_entry_t *entry, *next; + size_t i; + + for (i=0; i < ht->num_buckets; i++) { + for (entry = TABLE_HEAD(ht, i); entry != NULL; entry = next) { + next = ENTRY_NEXT(entry); + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + } + _Py_slist_init(&ht->buckets[i]); + } + ht->entries = 0; + hashtable_rehash(ht); +} + +extern "C" void +_Numba_hashtable_destroy(_Numba_hashtable_t *ht) +{ + size_t i; + + for (i = 0; i < ht->num_buckets; i++) { + _Py_slist_item_t *entry = ht->buckets[i].head; + while (entry) { + _Py_slist_item_t *entry_next = entry->next; + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + entry = entry_next; + } + } + + ht->alloc.free(ht->buckets); + ht->alloc.free(ht); +} + +/* Return a copy of the hash table */ +extern "C" _Numba_hashtable_t * +_Numba_hashtable_copy(_Numba_hashtable_t *src) +{ + _Numba_hashtable_t *dst; + _Numba_hashtable_entry_t *entry; + size_t bucket; + int err; + void *data, *new_data; + + dst = _Numba_hashtable_new_full(src->data_size, src->num_buckets, + src->hash_func, src->compare_func, + src->copy_data_func, src->free_data_func, + src->get_data_size_func, &src->alloc); + if (dst == NULL) + return NULL; + + for (bucket=0; bucket < src->num_buckets; bucket++) { + entry = TABLE_HEAD(src, bucket); + for (; entry; entry = ENTRY_NEXT(entry)) { + if (src->copy_data_func) { + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + new_data = src->copy_data_func(data); + if (new_data != NULL) + err = _Numba_hashtable_set(dst, entry->key, + &new_data, src->data_size); + else + err = 1; + } + else { + data = _Numba_HASHTABLE_ENTRY_DATA(entry); + err = _Numba_hashtable_set(dst, entry->key, data, src->data_size); + } + if (err) { + _Numba_hashtable_destroy(dst); + return NULL; + } + } + } + return dst; +} diff --git 
a/numba_cuda/numba/cuda/_dispatcher/_hashtable.h b/numba_cuda/numba/cuda/_dispatcher/_hashtable.h new file mode 100644 index 000000000..fbc6d6013 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_hashtable.h @@ -0,0 +1,132 @@ +/* + * See _hashtable.c for more information about this file. + */ + +#ifndef Py_HASHTABLE_H +#define Py_HASHTABLE_H + +/* The whole API is private */ +#ifndef Py_LIMITED_API + +typedef struct _Py_slist_item_s { + struct _Py_slist_item_s *next; +} _Py_slist_item_t; + +typedef struct { + _Py_slist_item_t *head; +} _Py_slist_t; + +#define _Py_SLIST_ITEM_NEXT(ITEM) (((_Py_slist_item_t *)ITEM)->next) + +#define _Py_SLIST_HEAD(SLIST) (((_Py_slist_t *)SLIST)->head) + +typedef struct { + /* used by _Numba_hashtable_t.buckets to link entries */ + _Py_slist_item_t _Py_slist_item; + + const void *key; + Py_uhash_t key_hash; + + /* data follows */ +} _Numba_hashtable_entry_t; + +#define _Numba_HASHTABLE_ENTRY_DATA(ENTRY) \ + ((char *)(ENTRY) + sizeof(_Numba_hashtable_entry_t)) + +#define _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(ENTRY) \ + (*(void **)_Numba_HASHTABLE_ENTRY_DATA(ENTRY)) + +#define _Numba_HASHTABLE_ENTRY_READ_DATA(TABLE, DATA, DATA_SIZE, ENTRY) \ + do { \ + assert((DATA_SIZE) == (TABLE)->data_size); \ + memcpy(DATA, _Numba_HASHTABLE_ENTRY_DATA(ENTRY), DATA_SIZE); \ + } while (0) + +typedef Py_uhash_t (*_Numba_hashtable_hash_func) (const void *key); +typedef int (*_Numba_hashtable_compare_func) (const void *key, const _Numba_hashtable_entry_t *he); +typedef void* (*_Numba_hashtable_copy_data_func)(void *data); +typedef void (*_Numba_hashtable_free_data_func)(void *data); +typedef size_t (*_Numba_hashtable_get_data_size_func)(void *data); + +typedef struct { + /* allocate a memory block */ + void* (*malloc) (size_t size); + + /* release a memory block */ + void (*free) (void *ptr); +} _Numba_hashtable_allocator_t; + +typedef struct { + size_t num_buckets; + size_t entries; /* Total number of entries in the table. 
*/ + _Py_slist_t *buckets; + size_t data_size; + + _Numba_hashtable_hash_func hash_func; + _Numba_hashtable_compare_func compare_func; + _Numba_hashtable_copy_data_func copy_data_func; + _Numba_hashtable_free_data_func free_data_func; + _Numba_hashtable_get_data_size_func get_data_size_func; + _Numba_hashtable_allocator_t alloc; +} _Numba_hashtable_t; + +/* hash and compare functions for integers and pointers */ +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_ptr(const void *key); +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_int(const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry); + +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new( + size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new_full( + size_t data_size, + size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_copy(_Numba_hashtable_t *src); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_clear(_Numba_hashtable_t *ht); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_destroy(_Numba_hashtable_t *ht); + +typedef int (*_Numba_hashtable_foreach_func) (_Numba_hashtable_entry_t *entry, void *arg); + +extern "C" PyAPI_FUNC(int) _Numba_hashtable_foreach( + _Numba_hashtable_t *ht, + _Numba_hashtable_foreach_func func, void *arg); +extern "C" PyAPI_FUNC(size_t) _Numba_hashtable_size(_Numba_hashtable_t *ht); + +extern "C" PyAPI_FUNC(_Numba_hashtable_entry_t*) _Numba_hashtable_get_entry( + _Numba_hashtable_t *ht, + const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_set( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_get( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_pop( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_delete( + _Numba_hashtable_t *ht, + const void *key); + +#define _Numba_HASHTABLE_SET(TABLE, KEY, DATA) \ + _Numba_hashtable_set(TABLE, KEY, &(DATA), sizeof(DATA)) + +#define _Numba_HASHTABLE_GET(TABLE, KEY, DATA) \ + _Numba_hashtable_get(TABLE, KEY, &(DATA), sizeof(DATA)) + +#endif /* Py_LIMITED_API */ + +#endif diff --git a/numba_cuda/numba/cuda/_dispatcher/_numba_common.h b/numba_cuda/numba/cuda/_dispatcher/_numba_common.h new file mode 100644 index 000000000..d458e4240 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_numba_common.h @@ -0,0 +1,43 @@ +#ifndef NUMBA_COMMON_H_ +#define NUMBA_COMMON_H_ + +/* __has_attribute() is a clang / gcc-5 macro */ +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + +/* This attribute marks symbols that can be shared across C objects + * but are not exposed outside of a shared library or executable. + * Note this is default behaviour for global symbols under Windows. 
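+ *
+ * For example (an illustrative sketch; the declarations below are
+ * hypothetical and not made by this header):
+ *
+ *   VISIBILITY_HIDDEN int internal_counter;     // shared inside the DSO only
+ *   VISIBILITY_GLOBAL void public_entry(void);  // exported from the DSO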
+ */ +#if defined(_MSC_VER) + #define VISIBILITY_HIDDEN + #define VISIBILITY_GLOBAL __declspec(dllexport) +#elif (__has_attribute(visibility) || (defined(__GNUC__) && __GNUC__ >= 4)) + #define VISIBILITY_HIDDEN __attribute__ ((visibility("hidden"))) + #define VISIBILITY_GLOBAL __attribute__ ((visibility("default"))) +#else + #define VISIBILITY_HIDDEN + #define VISIBILITY_GLOBAL +#endif + +/* + * Numba's version of the PyArray_DescrCheck macro from NumPy, use it as a + * direct replacement of NumPy's PyArray_DescrCheck to ensure binary + * compatibility. + * + * Details of why this is needed: + * NumPy 1.18 changed the definition of the PyArray_DescrCheck macro here: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698 + * the result of this being that building against NumPy <1.18 would prevent + * Numba running against NumPy >= 1.20 as noted here: + * https://github.com/numba/numba/issues/6041#issuecomment-665132199 + * + * This macro definition is copied from: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698#diff-ad2213da23136c5fc5883d9eb2d88666R26 + * + * NOTE: This is the NumPy 1.18 and above version of the macro. + */ +#define NUMBA_PyArray_DescrCheck(op) PyObject_TypeCheck(op, &PyArrayDescr_Type) + +#endif /* NUMBA_COMMON_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_pymodule.h b/numba_cuda/numba/cuda/_dispatcher/_pymodule.h new file mode 100644 index 000000000..c261314f5 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_pymodule.h @@ -0,0 +1,35 @@ +#ifndef NUMBA_PY_MODULE_H_ +#define NUMBA_PY_MODULE_H_ + +#define PY_SSIZE_T_CLEAN + +#include "Python.h" +#include "structmember.h" +#include "frameobject.h" + +#define MOD_ERROR_VAL NULL +#define MOD_SUCCESS_VAL(val) val +#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) +#define MOD_DEF(ob, name, doc, methods) { \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, NULL, NULL, NULL, NULL }; \ + ob = PyModule_Create(&moduledef); } +#define MOD_INIT_EXEC(name) PyInit_##name(); + +#define PyString_AsString PyUnicode_AsUTF8 +#define PyString_Check PyUnicode_Check +#define PyString_FromFormat PyUnicode_FromFormat +#define PyString_FromString PyUnicode_FromString +#define PyString_InternFromString PyUnicode_InternFromString +#define PyInt_Type PyLong_Type +#define PyInt_Check PyLong_Check +#define PyInt_CheckExact PyLong_CheckExact +#define SetAttrStringFromVoidPointer(m, name) do { \ + PyObject *tmp = PyLong_FromVoidPtr((void *) &name); \ + PyObject_SetAttrString(m, #name, tmp); \ + Py_DECREF(tmp); } while (0) + + +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +#endif /* NUMBA_PY_MODULE_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp new file mode 100644 index 000000000..99a1bd23e --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp @@ -0,0 +1,1170 @@ +#include "_pymodule.h" + +#include +#include +#include + +#include "_numba_common.h" +#include "_typeof.h" +#include "_hashtable.h" +#include "_devicearray.h" +#include "pyerrors.h" + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include +#if NPY_ABI_VERSION >= 0x02000000 + #include +#endif + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 13) + #ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 + #endif + #include "internal/pycore_setobject.h" // _PySet_NextEntry() +#endif + + +/* 
Cached typecodes for basic scalar types */ +static int tc_int8; +static int tc_int16; +static int tc_int32; +static int tc_int64; +static int tc_uint8; +static int tc_uint16; +static int tc_uint32; +static int tc_uint64; +static int tc_float32; +static int tc_float64; +static int tc_complex64; +static int tc_complex128; +static int BASIC_TYPECODES[12]; + +static int tc_intp; + +/* The type object for the numba .dispatcher.OmittedArg class + * that wraps omitted arguments. + */ +static PyObject *omittedarg_type; + +static PyObject *typecache; +static PyObject *ndarray_typecache; +static PyObject *structured_dtypes; + +static PyObject *str_typeof_pyval = NULL; +static PyObject *str_value = NULL; +static PyObject *str_numba_type = NULL; + +/* CUDA device array API */ +void **DeviceArray_API; + +/* + * Type fingerprint computation. + */ + +typedef struct { + /* A buffer the fingerprint will be written to */ + char *buf; + size_t n; + size_t allocated; + /* A preallocated buffer, sufficient to fit the fingerprint for most types */ + char static_buf[40]; +} string_writer_t; + +static void +string_writer_init(string_writer_t *w) +{ + w->buf = w->static_buf; + w->n = 0; + w->allocated = sizeof(w->static_buf) / sizeof(unsigned char); +} + +static void +string_writer_clear(string_writer_t *w) +{ + if (w->buf != w->static_buf) + free(w->buf); +} + +static void +string_writer_move(string_writer_t *dest, const string_writer_t *src) +{ + dest->n = src->n; + dest->allocated = src->allocated; + if (src->buf == src->static_buf) { + dest->buf = dest->static_buf; + memcpy(dest->buf, src->buf, src->n); + } + else { + dest->buf = src->buf; + } +} + +/* Ensure at least *bytes* can be appended to the string writer's buffer. */ +static int +string_writer_ensure(string_writer_t *w, size_t bytes) +{ + size_t newsize; + bytes += w->n; + if (bytes <= w->allocated) + return 0; + newsize = (w->allocated << 2) + 1; + if (newsize < bytes) + newsize = bytes; + if (w->buf == w->static_buf) { + w->buf = (char *) malloc(newsize); + memcpy(w->buf, w->static_buf, w->allocated); + } + else + w->buf = (char *) realloc(w->buf, newsize); + if (w->buf) { + w->allocated = newsize; + return 0; + } + else { + PyErr_NoMemory(); + return -1; + } +} + +static int +string_writer_put_char(string_writer_t *w, unsigned char c) +{ + if (string_writer_ensure(w, 1)) + return -1; + w->buf[w->n++] = c; + return 0; +} + +static int +string_writer_put_int32(string_writer_t *w, unsigned int v) +{ + if (string_writer_ensure(w, 4)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; + w->n += 4; + return 0; +} + +static int +string_writer_put_intp(string_writer_t *w, npy_intp v) +{ + if (string_writer_ensure(w, NPY_SIZEOF_PY_INTPTR_T)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; +#if NPY_SIZEOF_PY_INTPTR_T == 8 + w->buf[w->n + 4] = (v >> 32) & 0xff; + w->buf[w->n + 5] = (v >> 40) & 0xff; + w->buf[w->n + 6] = (v >> 48) & 0xff; + w->buf[w->n + 7] = (v >> 56) & 0xff; +#endif + w->n += NPY_SIZEOF_PY_INTPTR_T; + return 0; +} + +static int +string_writer_put_string(string_writer_t *w, const char *s) +{ + if (s == NULL) { + return string_writer_put_char(w, 0); + } + else { + size_t N = strlen(s) + 1; + if (string_writer_ensure(w, N)) + return -1; + memcpy(w->buf + w->n, s, N); + w->n += N; + return 0; + } +} + +enum opcode { + OP_START_TUPLE = 
'(', + OP_END_TUPLE = ')', + OP_INT = 'i', + OP_FLOAT = 'f', + OP_COMPLEX = 'c', + OP_BOOL = '?', + OP_OMITTED = '!', + + OP_BYTEARRAY = 'a', + OP_BYTES = 'b', + OP_NONE = 'n', + OP_LIST = '[', + OP_SET = '{', + + OP_BUFFER = 'B', + OP_NP_SCALAR = 'S', + OP_NP_ARRAY = 'A', + OP_NP_DTYPE = 'D' +}; + +#define TRY(func, w, arg) \ + do { \ + if (func(w, arg)) return -1; \ + } while (0) + + +static int +fingerprint_unrecognized(void) +{ + PyErr_SetString(PyExc_NotImplementedError, + "cannot compute type fingerprint for value"); + return -1; +} + +static int +compute_dtype_fingerprint(string_writer_t *w, PyArray_Descr *descr) +{ + int typenum = descr->type_num; + if (typenum < NPY_OBJECT) + return string_writer_put_char(w, (char) typenum); + if (typenum == NPY_VOID) { + /* Structured dtype: serialize the dtype pointer. Unfortunately, + * some structured dtypes can be ephemeral, so we have to + * intern them to avoid pointer reuse and fingerprint collisions. + * (e.g. np.recarray(dtype=some_dtype) creates a new dtype + * equal to some_dtype) + */ + PyObject *interned = PyDict_GetItem(structured_dtypes, + (PyObject *) descr); + if (interned == NULL) { + interned = (PyObject *) descr; + if (PyDict_SetItem(structured_dtypes, interned, interned)) + return -1; + } + TRY(string_writer_put_char, w, (char) typenum); + return string_writer_put_intp(w, (npy_intp) interned); + } +#if NPY_API_VERSION >= 0x00000007 + if (PyTypeNum_ISDATETIME(typenum)) { + PyArray_DatetimeMetaData *md; +#if NPY_ABI_VERSION >= 0x02000000 + md = &(((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(descr))->meta); +#else + md = &(((PyArray_DatetimeDTypeMetaData *)descr->c_metadata)->meta); +#endif + TRY(string_writer_put_char, w, (char) typenum); + TRY(string_writer_put_char, w, (char) md->base); + return string_writer_put_int32(w, (char) md->num); + } +#endif + + return fingerprint_unrecognized(); +} + +static int +compute_fingerprint(string_writer_t *w, PyObject *val) +{ + /* + * Implementation note: for performance, we start with common + * types that can be tested with fast checks. + */ + if (val == Py_None) + return string_writer_put_char(w, OP_NONE); + if (PyBool_Check(val)) + return string_writer_put_char(w, OP_BOOL); + /* Note we avoid matching int subclasses such as IntEnum */ + if (PyInt_CheckExact(val) || PyLong_CheckExact(val)) + return string_writer_put_char(w, OP_INT); + if (PyFloat_Check(val)) + return string_writer_put_char(w, OP_FLOAT); + if (PyComplex_CheckExact(val)) + return string_writer_put_char(w, OP_COMPLEX); + if (PyTuple_Check(val)) { + if(PyTuple_CheckExact(val)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(val); + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + TRY(string_writer_put_char, w, OP_END_TUPLE); + return 0; + } + /* as per typeof.py, check "_asdict" for namedtuple. */ + else if(PyObject_HasAttrString(val, "_asdict")) + { + /* + * This encodes the class name and field names of a namedtuple into + * the fingerprint on the condition that the number of fields is + * small (<10) and that the class name and field names are encodable + * as ASCII. 
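+ *
+ * As a worked example (for illustration only; Point is hypothetical and
+ * not defined here): for Point = namedtuple("Point", ["x", "y"]) and the
+ * value Point(1, 2.5), the writer receives the ASCII bytes of "Point",
+ * then OP_START_TUPLE, the bytes of "x" followed by OP_INT, the bytes of
+ * "y" followed by OP_FLOAT, and finally OP_END_TUPLE.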
+ */ + PyObject * clazz = NULL; + PyObject * name = NULL; + PyObject * _fields = PyObject_GetAttrString(val, "_fields"); + PyObject * field = NULL; + PyObject * ascii_str = NULL; + Py_ssize_t i, n, j, flen; + char * buf = NULL; + int ret; + + clazz = PyObject_GetAttrString(val, "__class__"); + if (clazz == NULL) + return -1; + + name = PyObject_GetAttrString(clazz, "__name__"); + Py_DECREF(clazz); + if (name == NULL) + return -1; + + ascii_str = PyUnicode_AsEncodedString(name, "ascii", "ignore"); + Py_DECREF(name); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + + if (_fields == NULL) + return -1; + + n = PyTuple_GET_SIZE(val); + + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) { + field = PyTuple_GET_ITEM(_fields, i); + if (field == NULL) + return -1; + ascii_str = PyUnicode_AsEncodedString(field, "ascii", "ignore"); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + } + TRY(string_writer_put_char, w, OP_END_TUPLE); + Py_DECREF(_fields); + return 0; + } + } + if (PyBytes_Check(val)) + return string_writer_put_char(w, OP_BYTES); + if (PyByteArray_Check(val)) + return string_writer_put_char(w, OP_BYTEARRAY); + if ((PyObject *) Py_TYPE(val) == omittedarg_type) { + PyObject *default_val = PyObject_GetAttr(val, str_value); + if (default_val == NULL) + return -1; + TRY(string_writer_put_char, w, OP_OMITTED); + TRY(compute_fingerprint, w, default_val); + Py_DECREF(default_val); + return 0; + } + + /* Skip numpy scalar check to prevent segfault */ + // if (PyArray_IsScalar(val, Generic)) { + // PyArray_Descr *descr = PyArray_DescrFromScalar(val); + // if (descr == NULL) + // return -1; + // TRY(string_writer_put_char, w, OP_NP_SCALAR); + // TRY(compute_dtype_fingerprint, w, descr); + // Py_DECREF(descr); + // return 0; + // } + + /* Skip numpy array check to prevent segfault */ + // if (PyArray_Check(val)) { + // PyArrayObject *ary = (PyArrayObject *) val; + // int ndim = PyArray_NDIM(ary); + + // TRY(string_writer_put_char, w, OP_NP_ARRAY); + // TRY(string_writer_put_int32, w, ndim); + // if (PyArray_IS_C_CONTIGUOUS(ary)) + // TRY(string_writer_put_char, w, 'C'); + // else if (PyArray_IS_F_CONTIGUOUS(ary)) + // TRY(string_writer_put_char, w, 'F'); + // else + // TRY(string_writer_put_char, w, 'A'); + // if (PyArray_ISWRITEABLE(ary)) + // TRY(string_writer_put_char, w, 'W'); + // else + // TRY(string_writer_put_char, w, 'R'); + // return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); + // } + + if (PyList_Check(val)) { + Py_ssize_t n = PyList_GET_SIZE(val); + if (n == 0) { + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty list"); + return -1; + } + /* Only the first item is considered, as in typeof.py */ + TRY(string_writer_put_char, w, OP_LIST); + TRY(compute_fingerprint, w, PyList_GET_ITEM(val, 0)); + return 0; + } + /* Note we only accept sets, not frozensets */ + if (Py_TYPE(val) == &PySet_Type) { + Py_hash_t h; + PyObject *item; + Py_ssize_t pos = 0; + /* Only one item is considered, as in typeof.py */ + if (!_PySet_NextEntry(val, &pos, &item, &h)) { + /* Empty set */ + PyErr_SetString(PyExc_ValueError, + "cannot compute 
fingerprint of empty set"); + return -1; + } + TRY(string_writer_put_char, w, OP_SET); + TRY(compute_fingerprint, w, item); + return 0; + } + if (PyObject_CheckBuffer(val)) { + Py_buffer buf; + int flags = PyBUF_ND | PyBUF_STRIDES | PyBUF_FORMAT; + char contig; + int ndim; + char readonly; + + /* Attempt to get a writable buffer, then fallback on read-only */ + if (PyObject_GetBuffer(val, &buf, flags | PyBUF_WRITABLE)) { + PyErr_Clear(); + if (PyObject_GetBuffer(val, &buf, flags)) + goto _unrecognized; + } + if (PyBuffer_IsContiguous(&buf, 'C')) + contig = 'C'; + else if (PyBuffer_IsContiguous(&buf, 'F')) + contig = 'F'; + else + contig = 'A'; + ndim = buf.ndim; + readonly = buf.readonly ? 'R' : 'W'; + if (string_writer_put_char(w, OP_BUFFER) || + string_writer_put_int32(w, ndim) || + string_writer_put_char(w, contig) || + string_writer_put_char(w, readonly) || + string_writer_put_string(w, buf.format) || + /* We serialize the object's Python type as well, to + distinguish between types which have Numba specializations + (e.g. array.array() vs. memoryview) + */ + string_writer_put_intp(w, (npy_intp) Py_TYPE(val))) { + PyBuffer_Release(&buf); + return -1; + } + PyBuffer_Release(&buf); + return 0; + } + + /* Skip numpy array descriptor check to prevent segfault */ + // if (NUMBA_PyArray_DescrCheck(val)) { + // TRY(string_writer_put_char, w, OP_NP_DTYPE); + // return compute_dtype_fingerprint(w, (PyArray_Descr *) val); + // } + +_unrecognized: + /* Type not recognized */ + return fingerprint_unrecognized(); +} + +PyObject * +typeof_compute_fingerprint(PyObject *val) +{ + PyObject *res; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) + goto error; + res = PyBytes_FromStringAndSize(w.buf, w.n); + + string_writer_clear(&w); + return res; + +error: + string_writer_clear(&w); + return NULL; +} + +/* + * Getting the typecode from a Type object. + */ +static int +_typecode_from_type_object(PyObject *tyobj) { + int typecode; + PyObject *tmpcode = PyObject_GetAttrString(tyobj, "_code"); + if (tmpcode == NULL) { + return -1; + } + typecode = PyLong_AsLong(tmpcode); + Py_DECREF(tmpcode); + return typecode; +} + +/* When we want to cache the type's typecode for later lookup, we need to + keep a reference to the returned type object so that it cannot be + deleted. This is because of the following events occurring when first + using a @jit function for a given set of types: + + 1. typecode_fallback requests a new typecode for an arbitrary Python value; + this implies creating a Numba type object (on the first dispatcher call); + the typecode cache is then populated. + 2. matching of the typecode list in _dispatcherimpl.cpp fails, since the + typecode is new. + 3. we have to compile: compile_and_invoke() is called, it will invoke + Dispatcher_Insert to register the new signature. + + The reference to the Numba type object returned in step 1 is deleted as + soon as we call Py_DECREF() on it, since we are holding the only + reference. If this happens and we use the typecode we got to populate the + cache, then the cache won't ever return the correct typecode, and the + dispatcher will never successfully match the typecodes with those of + some already-compiled instance. So we need to make sure that we don't + call Py_DECREF() on objects whose typecode will be used to populate the + cache. This is ensured by calling _typecode_fallback with + retain_reference == 0. 
+ + Note that technically we are leaking the reference, since we do not continue + to hold a pointer to the type object that we get back from typeof_pyval. + However, we don't need to refer to it again, we just need to make sure that + it is never deleted. +*/ +static int +_typecode_fallback(PyObject *dispatcher, PyObject *val, + int retain_reference) { + PyObject *numba_type; + int typecode; + + /* + * For values that define "_numba_type_", which holds a numba Type + * instance that should be used as the type of the value. + * Note this is done here, not in typeof_typecode(), so that + * some values can still benefit from fingerprint caching. + */ + if (str_numba_type != NULL && PyObject_HasAttr(val, str_numba_type)) { + numba_type = PyObject_GetAttrString(val, "_numba_type_"); + if (!numba_type) + return -1; + } + else { + // Go back to the interpreter + numba_type = PyObject_CallMethodObjArgs((PyObject *) dispatcher, + str_typeof_pyval, val, NULL); + } + if (!numba_type) + return -1; + typecode = _typecode_from_type_object(numba_type); + if (!retain_reference) + Py_DECREF(numba_type); + return typecode; +} + +/* Variations on _typecode_fallback for convenience */ + +static +int typecode_fallback(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 0); +} + +static +int typecode_fallback_keep_ref(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 1); +} + + +/* A cache mapping fingerprints (string_writer_t *) to typecodes (int). */ +static _Numba_hashtable_t *fingerprint_hashtable = NULL; + +static Py_uhash_t +hash_writer(const void *key) +{ + string_writer_t *writer = (string_writer_t *) key; + Py_uhash_t x = 0; + + /* The old FNV algorithm used by Python 2 */ + if (writer->n > 0) { + unsigned char *p = (unsigned char *) writer->buf; + Py_ssize_t len = writer->n; + x ^= *p << 7; + while (--len >= 0) + x = (1000003*x) ^ *p++; + x ^= writer->n; + if (x == (Py_uhash_t) -1) + x = -2; + } + return x; +} + +static int +compare_writer(const void *key, const _Numba_hashtable_entry_t *entry) +{ + string_writer_t *v = (string_writer_t *) key; + string_writer_t *w = (string_writer_t *) entry->key; + if (v->n != w->n) + return 0; + return memcmp(v->buf, w->buf, v->n) == 0; +} + +/* Try to compute *val*'s typecode using its fingerprint and the + * fingerprint->typecode cache. + */ +static int +typecode_using_fingerprint(PyObject *dispatcher, PyObject *val) +{ + int typecode; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) { + string_writer_clear(&w); + if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) { + /* Can't compute a type fingerprint for the given value, + fall back on typeof() without caching. */ + PyErr_Clear(); + return typecode_fallback(dispatcher, val); + } + return -1; + } + + /* Check if hashtable is initialized */ + if (fingerprint_hashtable == NULL) { + string_writer_clear(&w); + return typecode_fallback(dispatcher, val); + } + + if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0) { + /* Cache hit */ + string_writer_clear(&w); + return typecode; + } + + /* Not found in cache: invoke pure Python typeof() and cache result. + * Note we have to keep the type alive forever as explained + * above in _typecode_fallback(). 
+ */ + typecode = typecode_fallback_keep_ref(dispatcher, val); + if (typecode >= 0) { + string_writer_t *key = (string_writer_t *) malloc(sizeof(string_writer_t)); + if (key == NULL) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + /* Ownership of the string writer's buffer will be transferred + * to the hash table. + */ + string_writer_move(key, &w); + if (_Numba_HASHTABLE_SET(fingerprint_hashtable, key, typecode)) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + } + return typecode; +} + + +/* + * Direct lookup table for extra-fast typecode resolution of simple array types. + */ + +#define N_DTYPES 12 +#define N_NDIM 5 /* Fast path for up to 5D array */ +#define N_LAYOUT 3 +static int cached_arycode[N_NDIM][N_LAYOUT][N_DTYPES]; + +/* Convert a Numpy dtype number to an internal index into cached_arycode. + The returned value must also be a valid index into BASIC_TYPECODES. */ +static int dtype_num_to_typecode(int type_num) { + int dtype; + switch(type_num) { + case NPY_INT8: + dtype = 0; + break; + case NPY_INT16: + dtype = 1; + break; + case NPY_INT32: + dtype = 2; + break; + case NPY_INT64: + dtype = 3; + break; + case NPY_UINT8: + dtype = 4; + break; + case NPY_UINT16: + dtype = 5; + break; + case NPY_UINT32: + dtype = 6; + break; + case NPY_UINT64: + dtype = 7; + break; + case NPY_FLOAT32: + dtype = 8; + break; + case NPY_FLOAT64: + dtype = 9; + break; + case NPY_COMPLEX64: + dtype = 10; + break; + case NPY_COMPLEX128: + dtype = 11; + break; + default: + /* Type not included in the global lookup table */ + dtype = -1; + } + return dtype; +} + +static +int get_cached_typecode(PyArray_Descr* descr) { + PyObject* tmpobject = PyDict_GetItem(typecache, (PyObject*)descr); + if (tmpobject == NULL) + return -1; + + return PyLong_AsLong(tmpobject); +} + +static +void cache_typecode(PyArray_Descr* descr, int typecode) { + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(typecache, (PyObject*)descr, value); + Py_DECREF(value); +} + +static +PyObject* ndarray_key(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* tmpndim = PyLong_FromLong(ndim); + PyObject* tmplayout = PyLong_FromLong(layout); + PyObject* tmpreadonly = PyLong_FromLong(readonly); + PyObject* key = PyTuple_Pack(4, tmpndim, tmplayout, tmpreadonly, descr); + Py_DECREF(tmpndim); + Py_DECREF(tmplayout); + Py_DECREF(tmpreadonly); + return key; +} + +static +int get_cached_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject *tmpobject = PyDict_GetItem(ndarray_typecache, key); + if (tmpobject == NULL) + return -1; + + Py_DECREF(key); + return PyLong_AsLong(tmpobject); +} + +static +void cache_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr, + int typecode) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(ndarray_typecache, key, value); + Py_DECREF(key); + Py_DECREF(value); +} + +static +int typecode_ndarray(PyObject *dispatcher, PyArrayObject *ary) { + int typecode; + int dtype; + int ndim = PyArray_NDIM(ary); + int layout = 0; + int readonly = 0; + + /* The order in which we check for the right contiguous-ness is important. + The order must match the order by numba.numpy_support.map_layout. + Further, only *contiguous-ness* is checked, not alignment, byte order or + write permissions. 
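+
+   For reference, the mapping established by the checks below is:
+
+     layout = 1  for C-contiguous arrays   (numba layout 'C')
+     layout = 2  for F-contiguous arrays   (numba layout 'F')
+     layout = 0  otherwise                 (numba layout 'A')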
+ */ + if (PyArray_IS_C_CONTIGUOUS(ary)){ + layout = 1; + } else if (PyArray_IS_F_CONTIGUOUS(ary)) { + layout = 2; + } + + /* the typecode cache by convention is for "behaved" arrays (aligned and + * writeable), all others must be forced to the fall back */ + if (!PyArray_ISBEHAVED(ary)) goto FALLBACK; + + if (ndim <= 0 || ndim > N_NDIM) goto FALLBACK; + + dtype = dtype_num_to_typecode(PyArray_TYPE(ary)); + if (dtype == -1) goto FALLBACK; + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + + typecode = cached_arycode[ndim - 1][layout][dtype]; + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types */ + + /* If this isn't a structured array then we can't use the cache */ + if (PyArray_TYPE(ary) != NPY_VOID) + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); + + /* Check type cache */ + readonly = !PyArray_ISWRITEABLE(ary); + typecode = get_cached_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary)); + if (typecode == -1) { + /* First use of this type, use fallback and populate the cache */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cache_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary), typecode); + } + return typecode; +} + +static +int typecode_arrayscalar(PyObject *dispatcher, PyObject* aryscalar) { + int typecode; + PyArray_Descr *descr; + descr = PyArray_DescrFromScalar(aryscalar); + if (!descr) + return typecode_using_fingerprint(dispatcher, aryscalar); + + /* Is it a structured scalar? */ + if (descr->type_num == NPY_VOID) { + typecode = get_cached_typecode(descr); + if (typecode == -1) { + /* Resolve through fallback then populate cache */ + typecode = typecode_fallback_keep_ref(dispatcher, aryscalar); + cache_typecode(descr, typecode); + } + Py_DECREF(descr); + return typecode; + } + + /* Is it one of the well-known basic types? */ + typecode = dtype_num_to_typecode(descr->type_num); + Py_DECREF(descr); + if (typecode == -1) + return typecode_using_fingerprint(dispatcher, aryscalar); + return BASIC_TYPECODES[typecode]; +} + +static +int typecode_devicendarray(PyObject *dispatcher, PyObject *ary) +{ + int typecode; + int dtype; + int ndim; + int layout = 0; + PyObject *ndim_obj = nullptr; + PyObject *num_obj = nullptr; + PyObject *dtype_obj = nullptr; + int dtype_num = 0; + + PyObject* flags = PyObject_GetAttrString(ary, "flags"); + if (flags == NULL) + { + PyErr_Clear(); + goto FALLBACK; + } + + if (PyDict_GetItemString(flags, "C_CONTIGUOUS") == Py_True) { + layout = 1; + } else if (PyDict_GetItemString(flags, "F_CONTIGUOUS") == Py_True) { + layout = 2; + } + + Py_DECREF(flags); + + ndim_obj = PyObject_GetAttrString(ary, "ndim"); + if (ndim_obj == NULL) { + /* If there's no ndim, try to proceed by clearing the error and using the + * fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + ndim = PyLong_AsLong(ndim_obj); + Py_DECREF(ndim_obj); + + if (PyErr_Occurred()) { + /* ndim wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + if (ndim <= 0 || ndim > N_NDIM) + goto FALLBACK; + + dtype_obj = PyObject_GetAttrString(ary, "dtype"); + if (dtype_obj == NULL) { + /* No dtype: try the fallback. 
*/ + PyErr_Clear(); + goto FALLBACK; + } + + num_obj = PyObject_GetAttrString(dtype_obj, "num"); + Py_DECREF(dtype_obj); + + if (num_obj == NULL) { + /* This strange dtype has no num - try the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype_num = PyLong_AsLong(num_obj); + Py_DECREF(num_obj); + + if (PyErr_Occurred()) { + /* num wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype = dtype_num_to_typecode(dtype_num); + if (dtype == -1) { + /* Not a dtype we have in the global lookup table. */ + goto FALLBACK; + } + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + typecode = cached_arycode[ndim - 1][layout][dtype]; + + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types. At present this always uses + the fingerprinting to get the typecode. Future optimization might + implement a cache, but this would require some fast equivalent of + PyArray_DESCR for a device array. */ + + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); +} + +extern "C" int +typeof_typecode(PyObject *dispatcher, PyObject *val) +{ + int subtype_attr; + /* This needs to be kept in sync with Dispatcher.typeof_pyval(), + * otherwise funny things may happen. + */ + + if (PyLong_Check(val)) { +#if SIZEOF_VOID_P < 8 + /* On 32-bit platforms, choose between tc_intp (32-bit) and tc_int64 */ + PY_LONG_LONG ll = PyLong_AsLongLong(val); + if (ll == -1 && PyErr_Occurred()) { + /* The integer is too large, let us truncate it */ + PyErr_Clear(); + return tc_int64; + } + if ((ll & 0xffffffff) != ll) + return tc_int64; +#endif + return tc_intp; + } + else if (PyFloat_Check(val)) + return tc_float64; + else if (PyComplex_Check(val)) + return tc_complex128; + /* Skip all problematic array checks for now to avoid segfault */ + // /* Array scalar handling */ + // else if (PyArray_CheckScalar(val)) { + // return typecode_arrayscalar(dispatcher, val); + // } + // /* Array handling */ + // else if (tyobj == &PyArray_Type) { + // return typecode_ndarray(dispatcher, (PyArrayObject*)val); + // } + // /* Subtype of CUDA device array */ + // else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { + // return typecode_devicendarray(dispatcher, val); + // } + // /* Subtypes of Array handling */ + // else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + // /* By default, Numba will treat all numpy.ndarray subtypes as if they + // were the base numpy.ndarray type. In this way, ndarray subtypes + // can easily use all of the support that Numba has for ndarray + // methods. + // EXPERIMENTAL: There may be cases where a programmer would NOT want + // ndarray subtypes to be treated exactly like the base numpy.ndarray. + // For this purpose, a currently experimental feature allows a + // programmer to add an attribute named + // __numba_array_subtype_dispatch__ to their ndarray subtype. This + // attribute can have any value as Numba only checks for the presence + // of the attribute and not its value. When present, a ndarray subtype + // will NOT be typed by Numba as a regular ndarray but this code will + // fallthrough to the typecode_using_fingerprint call, which will + // create a new unique Numba typecode for this ndarray subtype. 
This + // behavior has several significant effects. First, since this + // ndarray subtype will be treated as a different type by Numba, + // the Numba dispatcher would then specialize on this type. So, if + // there was a function that had several parameters that were + // expected to be either numpy.ndarray or a subtype of ndarray, then + // Numba would compile a custom version of this function for each + // combination of base and subtypes that were actually passed to the + // function. Second, because this subtype would now be treated as + // a totally separate type, it will cease to function in Numba unless + // an implementation of that type is provided to Numba through the + // Numba type extension mechanisms (e.g., overload). This would + // typically start with defining a Numba type corresponding to the + // ndarray subtype. This is the same concept as how Numba has a + // corollary of numpy.ndarray in its type system as types.Array. + // Next, one would typically defining boxing and unboxing routines + // and the associated memory model. Then, overloads for NumPy + // functions on that type would be created. However, + // if the same default array memory model is used then there are tricks + // one can do to look at Numba's internal types.Array registries and + // to quickly apply those to the subtype as well. In this manner, + // only those cases where the base ndarray and the ndarray subtype + // behavior differ would new custom functions need to be written for + // the subtype. Finally, + // after adding support for the new type, you would have a separate + // ndarray subtype that could operate with other objects of the same + // subtype but would not support interoperation with regular NumPy + // ndarrays. In standard Python, this interoperation is provided + // through the __array_ufunc__ magic method in the ndarray subtype + // class and in that case the function operates on ndarrays or their + // subtypes. This idea is extended into Numba such that + // __array_ufunc__ can be present in a Numba array type object. + // In this case, this function is consulted during Numba typing and + // so the arguments to __array_ufunc__ are Numba types instead of + // ndarray subtypes. The array type __array_ufunc__ returns the + // type of the output of the given ufunc. + // */ + // subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__"); + // if (!subtype_attr) { + // return typecode_ndarray(dispatcher, (PyArrayObject*)val); + // } + // } + + return typecode_using_fingerprint(dispatcher, val); +} + + +static +void* wrap_import_array(void) { + import_array(); /* import array returns NULL on failure */ + return (void*)1; +} + + +static +int init_numpy(void) { + return wrap_import_array() != NULL; +} + + +/* + * typeof_init(omittedarg_type, typecode_dict) + * (called from dispatcher.py to fill in missing information) + */ +extern "C" PyObject * +typeof_init(PyObject *self, PyObject *args) +{ + PyObject *tmpobj; + PyObject *dict; + int index = 0; + + if (!PyArg_ParseTuple(args, "O!O!:typeof_init", + &PyType_Type, &omittedarg_type, + &PyDict_Type, &dict)) + return NULL; + + /* Initialize Numpy API */ + if ( ! 
init_numpy() ) { + return NULL; + } + + #define UNWRAP_TYPE(S) \ + if(!(tmpobj = PyDict_GetItemString(dict, #S))) return NULL; \ + else { tc_##S = PyLong_AsLong(tmpobj); \ + BASIC_TYPECODES[index++] = tc_##S; } + + UNWRAP_TYPE(int8) + UNWRAP_TYPE(int16) + UNWRAP_TYPE(int32) + UNWRAP_TYPE(int64) + + UNWRAP_TYPE(uint8) + UNWRAP_TYPE(uint16) + UNWRAP_TYPE(uint32) + UNWRAP_TYPE(uint64) + + UNWRAP_TYPE(float32) + UNWRAP_TYPE(float64) + + UNWRAP_TYPE(complex64) + UNWRAP_TYPE(complex128) + + switch(sizeof(void*)) { + case 4: + tc_intp = tc_int32; + break; + case 8: + tc_intp = tc_int64; + break; + default: + PyErr_SetString(PyExc_AssertionError, "sizeof(void*) != {4, 8}"); + return NULL; + } + + #undef UNWRAP_TYPE + + typecache = PyDict_New(); + ndarray_typecache = PyDict_New(); + structured_dtypes = PyDict_New(); + if (typecache == NULL || ndarray_typecache == NULL || + structured_dtypes == NULL) { + PyErr_SetString(PyExc_RuntimeError, "failed to create type cache"); + return NULL; + } + + fingerprint_hashtable = _Numba_hashtable_new(sizeof(int), + hash_writer, + compare_writer); + if (fingerprint_hashtable == NULL) { + PyErr_NoMemory(); + return NULL; + } + + /* initialize cached_arycode to all ones (in bits) */ + memset(cached_arycode, 0xFF, sizeof(cached_arycode)); + + str_typeof_pyval = PyString_InternFromString("typeof_pyval"); + str_value = PyString_InternFromString("value"); + str_numba_type = PyString_InternFromString("_numba_type_"); + if (!str_value || !str_typeof_pyval || !str_numba_type) + return NULL; + + Py_RETURN_NONE; +} diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.h b/numba_cuda/numba/cuda/_dispatcher/_typeof.h new file mode 100644 index 000000000..6e0039b5f --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.h @@ -0,0 +1,16 @@ +#ifndef NUMBA_TYPEOF_H_ +#define NUMBA_TYPEOF_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +extern PyObject *typeof_init(PyObject *self, PyObject *args); +extern int typeof_typecode(PyObject *dispatcher, PyObject *val); +extern PyObject *typeof_compute_fingerprint(PyObject *val); + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_TYPEOF_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp b/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp new file mode 100644 index 000000000..3c51fdfa7 --- /dev/null +++ b/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp @@ -0,0 +1,209 @@ +#include +#include +#include +#include + +#include "typeconv.hpp" + + +// ------ TypeManager ------ + +TCCMap::TCCMap() + : nb_records(0) +{ +} + +size_t TCCMap::hash(const TypePair &key) const { + return std::hash()(std::hash()(key.first)) ^ + std::hash()(key.second); +} + +void TCCMap::insert(const TypePair &key, TypeCompatibleCode val) { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + bin[j].val = val; + return; + } + } + bin.push_back({key, val}); + nb_records++; +} + +TypeCompatibleCode TCCMap::find(const TypePair &key) const { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + const TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + return bin[j].val; + } + } + return TCC_FALSE; +} + +// ----- Ratings ----- +Rating::Rating() : promote(0), safe_convert(0), unsafe_convert(0) { } + +inline bool Rating::operator < (const Rating &other) const { + if (unsafe_convert < other.unsafe_convert) + return true; + else if (unsafe_convert > other.unsafe_convert) + return false; + if (safe_convert < 
other.safe_convert) + return true; + else if (safe_convert > other.safe_convert) + return false; + return (promote < other.promote); +} + +inline bool Rating::operator == (const Rating &other) const { + return promote == other.promote && safe_convert == other.safe_convert && + unsafe_convert == other.unsafe_convert; +} + +// ------ TypeManager ------ + +bool TypeManager::canPromote(Type from, Type to) const { + return isCompatible(from, to) == TCC_PROMOTE; +} + +bool TypeManager::canSafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_SAFE; +} + +bool TypeManager::canUnsafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_UNSAFE; +} + +void TypeManager::addPromotion(Type from, Type to) { + return addCompatibility(from, to, TCC_PROMOTE); +} + +void TypeManager::addUnsafeConversion(Type from, Type to) { + return addCompatibility(from, to, TCC_CONVERT_UNSAFE); +} + +void TypeManager::addSafeConversion(Type from, Type to) { + return addCompatibility(from, to, TCC_CONVERT_SAFE); +} + +void TypeManager::addCompatibility(Type from, Type to, TypeCompatibleCode tcc) { + TypePair pair(from, to); + tccmap.insert(pair, tcc); +} + +TypeCompatibleCode TypeManager::isCompatible(Type from, Type to) const { + if (from == to) + return TCC_EXACT; + TypePair pair(from, to); + return tccmap.find(pair); +} + + +int TypeManager::selectOverload(const Type sig[], const Type ovsigs[], + int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required + ) const { + int count; + if (ovct <= 16) { + Rating ratings[16]; + int candidates[16]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + } + else { + Rating *ratings = new Rating[ovct]; + int *candidates = new int[ovct]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + delete [] ratings; + delete [] candidates; + } + return count; +} + +int TypeManager::_selectOverload(const Type sig[], const Type ovsigs[], + int &selected, int sigsz, int ovct, + bool allow_unsafe, bool exact_match_required, + Rating ratings[], int candidates[]) const { + // Generate rating table + // Use a penalize scheme. 
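+    // (Each argument that needs a promotion or a conversion bumps the
+    // corresponding counter in that candidate's Rating; Rating::operator<
+    // then prefers the fewest unsafe conversions, then the fewest safe
+    // conversions, then the fewest promotions.)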
+
+    int nb_candidates = 0;
+
+    for (int i = 0; i < ovct; ++i) {
+        const Type *entry = &ovsigs[i * sigsz];
+        Rating rate;
+
+        for (int j = 0; j < sigsz; ++j) {
+            TypeCompatibleCode tcc = isCompatible(sig[j], entry[j]);
+            if (tcc == TCC_FALSE ||
+                (tcc == TCC_CONVERT_UNSAFE && !allow_unsafe) ||
+                (tcc != TCC_EXACT && exact_match_required)) {
+                // stop the loop early
+                goto _incompatible;
+            }
+            switch(tcc) {
+            case TCC_PROMOTE:
+                rate.promote += 1;
+                break;
+            case TCC_CONVERT_SAFE:
+                rate.safe_convert += 1;
+                break;
+            case TCC_CONVERT_UNSAFE:
+                rate.unsafe_convert += 1;
+                break;
+            default:
+                break;
+            }
+        }
+        ratings[nb_candidates] = rate;
+        candidates[nb_candidates] = i;
+        nb_candidates++;
+    _incompatible:
+        ;
+    }
+
+    // Bail if no match
+    if (nb_candidates == 0)
+        return 0;
+
+    // Find lowest rating
+    Rating best = ratings[0];
+    selected = candidates[0];
+
+    int matchcount = 1;
+    for (int i = 1; i < nb_candidates; ++i) {
+        if (ratings[i] < best) {
+            best = ratings[i];
+            selected = candidates[i];
+            matchcount = 1;
+        }
+        else if (ratings[i] == best) {
+            matchcount += 1;
+        }
+    }
+    return matchcount;
+}
+
+// ----- utils -----
+
+const char* TCCString(TypeCompatibleCode tcc) {
+    switch(tcc) {
+    case TCC_EXACT:
+        return "exact";
+    case TCC_SUBTYPE:
+        return "subtype";
+    case TCC_PROMOTE:
+        return "promote";
+    case TCC_CONVERT_SAFE:
+        return "safe_convert";
+    case TCC_CONVERT_UNSAFE:
+        return "unsafe_convert";
+    default:
+        return "false";
+    }
+}
diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp b/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp
new file mode 100644
index 000000000..1f3cb9359
--- /dev/null
+++ b/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp
@@ -0,0 +1,98 @@
+#ifndef NUMBA_TYPECONV_HPP_
+#define NUMBA_TYPECONV_HPP_
+#include <utility>
+#include <vector>
+
+
+typedef int Type;
+
+enum TypeCompatibleCode{
+    // No match
+    TCC_FALSE = 0,
+    // Exact match
+    TCC_EXACT,
+    // Subtype is UNUSED
+    TCC_SUBTYPE,
+    // Promotion with no precision loss
+    TCC_PROMOTE,
+    // Conversion with no precision loss
+    // e.g. int32 to double
+    TCC_CONVERT_SAFE,
+    // Conversion with precision loss
+    // e.g. int64 to double (53 bits precision)
+    TCC_CONVERT_UNSAFE,
+};
+
+typedef std::pair<Type, Type> TypePair;
+
+struct TCCRecord {
+    TypePair key;
+    TypeCompatibleCode val;
+};
+
+typedef std::vector<TCCRecord> TCCMapBin;
+
+class TCCMap {
+public:
+    TCCMap();
+
+    void insert(const TypePair &key, TypeCompatibleCode val);
+    TypeCompatibleCode find(const TypePair &key) const;
+private:
+    size_t hash(const TypePair &key) const;
+
+    /* Must be a power of two */
+    static const size_t TCCMAP_SIZE = 512;
+    TCCMapBin records[TCCMAP_SIZE];
+    int nb_records;
+};
+
+struct Rating {
+    unsigned int promote;
+    unsigned int safe_convert;
+    unsigned int unsafe_convert;
+
+    Rating();
+
+    bool operator < (const Rating &other) const;
+    bool operator == (const Rating &other) const;
+};
+
+
+class TypeManager{
+public:
+    bool canPromote(Type from, Type to) const;
+    bool canUnsafeConvert(Type from, Type to) const;
+    bool canSafeConvert(Type from, Type to) const;
+
+    void addPromotion(Type from, Type to);
+    void addUnsafeConversion(Type from, Type to);
+    void addSafeConversion(Type from, Type to);
+    void addCompatibility(Type from, Type to, TypeCompatibleCode by);
+
+    TypeCompatibleCode isCompatible(Type from, Type to) const;
+
+    /**
+        Output stored in selected.
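+        (selected receives the index of the winning overload; sig holds
+        sigsz argument typecodes and ovsigs holds ovct candidate signatures,
+        stored contiguously with sigsz entries per candidate.)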
+ Returns + Number of matches + */ + int selectOverload(const Type sig[], const Type ovsigs[], int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required + ) const; + +private: + int _selectOverload(const Type sig[], const Type ovsigs[], int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required, + Rating ratings[], int candidates[]) const; + + TCCMap tccmap; +}; + + +const char* TCCString(TypeCompatibleCode tcc); + + +#endif // NUMBA_TYPECONV_HPP_ diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 68706bfd1..d02beffb0 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba import _dispatcher +from numba.cuda import _dispatcher from warnings import warn diff --git a/pyproject.toml b/pyproject.toml index 3fd0f65fc..e0c2da629 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", + "numpy", ] [project] diff --git a/setup.py b/setup.py index bfb11f27a..31f66ae0e 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,103 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pathlib +import sys -from setuptools import setup +from setuptools import setup, Extension from setuptools.command.build_py import build_py from setuptools.command.editable_wheel import editable_wheel, _TopLevelFinder +from setuptools.command.build_ext import build_ext REDIRECTOR_PTH = "_numba_cuda_redirector.pth" REDIRECTOR_PY = "_numba_cuda_redirector.py" SITE_PACKAGES = pathlib.Path("site-packages") +def get_version(): + """Read version from VERSION file.""" + version_file = pathlib.Path(__file__).parent / "numba_cuda" / "VERSION" + return version_file.read_text().strip() + + +def get_ext_modules(): + """ + Return a list of Extension instances for the setup() call. + """ + # Note we don't import NumPy at the toplevel, since setup.py + # should be able to run without NumPy for pip to discover the + # build dependencies. Need NumPy headers and libm linkage. + import numpy as np + + np_compile_args = { + "include_dirs": [ + np.get_include(), + ], + } + if sys.platform != "win32": + np_compile_args["libraries"] = [ + "m", + ] + + dispatcher_sources = [ + "numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp", + "numba_cuda/numba/cuda/_dispatcher/_typeof.cpp", + "numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp", + "numba_cuda/numba/cuda/_dispatcher/typeconv.cpp", + ] + ext_dispatcher = Extension( + name="numba_cuda.numba.cuda._dispatcher", + sources=dispatcher_sources, + depends=[ + "numba_cuda/numba/cuda/_dispatcher/_pymodule.h", + "numba_cuda/numba/cuda/_dispatcher/_typeof.h", + "numba_cuda/numba/cuda/_dispatcher/_hashtable.h", + ], + extra_compile_args=["-std=c++11"], + **np_compile_args, + ) + + # Add our include directory to the existing include_dirs + ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/_dispatcher") + + return [ext_dispatcher] + + +def is_building(): + """ + Parse the setup.py command and return whether a build is requested. + If False is returned, only an informational command is run. + If True is returned, information about C extensions will have to + be passed to the setup() function. + """ + if len(sys.argv) < 2: + # User forgot to give an argument probably, let setuptools handle that. 
+ return True + + build_commands = [ + "build", + "build_py", + "build_ext", + "build_clibbuild_scripts", + "install", + "install_lib", + "install_headers", + "install_scripts", + "install_data", + "sdist", + "bdist", + "bdist_dumb", + "bdist_rpm", + "bdist_wininst", + "check", + "build_doc", + "bdist_wheel", + "bdist_egg", + "develop", + "easy_install", + "test", + ] + return any(bc in sys.argv[1:] for bc in build_commands) + + # Adapted from https://stackoverflow.com/a/71137790 class build_py_with_redirector(build_py): # noqa: N801 """Include the redirector files in the generated wheel.""" @@ -70,9 +158,52 @@ def _select_strategy(self, name, tag, build_lib): return TopLevelFinderWithRedirector(self.distribution, name) +cmdclass = {} + +numba_be_user_options = [ + ("werror", None, "Build extensions with -Werror"), + ("wall", None, "Build extensions with -Wall"), + ("noopt", None, "Build extensions without optimization"), +] + + +class NumbaBuildExt(build_ext): + user_options = build_ext.user_options + numba_be_user_options + boolean_options = build_ext.boolean_options + ["werror", "wall", "noopt"] + + def initialize_options(self): + super().initialize_options() + self.werror = 0 + self.wall = 0 + self.noopt = 0 + + def run(self): + extra_compile_args = [] + if self.noopt: + if sys.platform == "win32": + extra_compile_args.append("/Od") + else: + extra_compile_args.append("-O0") + if self.werror: + extra_compile_args.append("-Werror") + if self.wall: + extra_compile_args.append("-Wall") + for ext in self.extensions: + ext.extra_compile_args.extend(extra_compile_args) + + super().run() + + +cmdclass["build_ext"] = NumbaBuildExt +cmdclass["build_py"] = build_py_with_redirector +cmdclass["editable_wheel"] = editable_wheel_with_redirector + +if is_building(): + ext_modules = get_ext_modules() +else: + ext_modules = [] + setup( - cmdclass={ - "build_py": build_py_with_redirector, - "editable_wheel": editable_wheel_with_redirector, - } + cmdclass=cmdclass, + ext_modules=ext_modules, ) From e0c2ea8074477436d2c874a6b451d30580b3207f Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Mon, 4 Aug 2025 22:08:29 -0700 Subject: [PATCH 03/62] [Refactor][NFC] _dispatcher cext: Re-enable array handling, invoke typeof_init --- numba_cuda/numba/cuda/_dispatcher/_typeof.cpp | 227 +++++++++--------- numba_cuda/numba/cuda/dispatcher.py | 13 + 2 files changed, 120 insertions(+), 120 deletions(-) diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp index 99a1bd23e..16c5009c4 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp @@ -366,38 +366,35 @@ compute_fingerprint(string_writer_t *w, PyObject *val) Py_DECREF(default_val); return 0; } - - /* Skip numpy scalar check to prevent segfault */ - // if (PyArray_IsScalar(val, Generic)) { - // PyArray_Descr *descr = PyArray_DescrFromScalar(val); - // if (descr == NULL) - // return -1; - // TRY(string_writer_put_char, w, OP_NP_SCALAR); - // TRY(compute_dtype_fingerprint, w, descr); - // Py_DECREF(descr); - // return 0; - // } - - /* Skip numpy array check to prevent segfault */ - // if (PyArray_Check(val)) { - // PyArrayObject *ary = (PyArrayObject *) val; - // int ndim = PyArray_NDIM(ary); - - // TRY(string_writer_put_char, w, OP_NP_ARRAY); - // TRY(string_writer_put_int32, w, ndim); - // if (PyArray_IS_C_CONTIGUOUS(ary)) - // TRY(string_writer_put_char, w, 'C'); - // else if (PyArray_IS_F_CONTIGUOUS(ary)) - // TRY(string_writer_put_char, w, 'F'); - // else - 
// TRY(string_writer_put_char, w, 'A'); - // if (PyArray_ISWRITEABLE(ary)) - // TRY(string_writer_put_char, w, 'W'); - // else - // TRY(string_writer_put_char, w, 'R'); - // return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); - // } - + if (PyArray_IsScalar(val, Generic)) { + /* Note: PyArray_DescrFromScalar() may be a bit slow on + non-trivial types. */ + PyArray_Descr *descr = PyArray_DescrFromScalar(val); + if (descr == NULL) + return -1; + TRY(string_writer_put_char, w, OP_NP_SCALAR); + TRY(compute_dtype_fingerprint, w, descr); + Py_DECREF(descr); + return 0; + } + if (PyArray_Check(val)) { + PyArrayObject *ary = (PyArrayObject *) val; + int ndim = PyArray_NDIM(ary); + + TRY(string_writer_put_char, w, OP_NP_ARRAY); + TRY(string_writer_put_int32, w, ndim); + if (PyArray_IS_C_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'C'); + else if (PyArray_IS_F_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'F'); + else + TRY(string_writer_put_char, w, 'A'); + if (PyArray_ISWRITEABLE(ary)) + TRY(string_writer_put_char, w, 'W'); + else + TRY(string_writer_put_char, w, 'R'); + return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); + } if (PyList_Check(val)) { Py_ssize_t n = PyList_GET_SIZE(val); if (n == 0) { @@ -463,12 +460,10 @@ compute_fingerprint(string_writer_t *w, PyObject *val) PyBuffer_Release(&buf); return 0; } - - /* Skip numpy array descriptor check to prevent segfault */ - // if (NUMBA_PyArray_DescrCheck(val)) { - // TRY(string_writer_put_char, w, OP_NP_DTYPE); - // return compute_dtype_fingerprint(w, (PyArray_Descr *) val); - // } + if (NUMBA_PyArray_DescrCheck(val)) { + TRY(string_writer_put_char, w, OP_NP_DTYPE); + return compute_dtype_fingerprint(w, (PyArray_Descr *) val); + } _unrecognized: /* Type not recognized */ @@ -550,7 +545,7 @@ _typecode_fallback(PyObject *dispatcher, PyObject *val, * Note this is done here, not in typeof_typecode(), so that * some values can still benefit from fingerprint caching. */ - if (str_numba_type != NULL && PyObject_HasAttr(val, str_numba_type)) { + if (PyObject_HasAttr(val, str_numba_type)) { numba_type = PyObject_GetAttrString(val, "_numba_type_"); if (!numba_type) return -1; @@ -635,13 +630,6 @@ typecode_using_fingerprint(PyObject *dispatcher, PyObject *val) } return -1; } - - /* Check if hashtable is initialized */ - if (fingerprint_hashtable == NULL) { - string_writer_clear(&w); - return typecode_fallback(dispatcher, val); - } - if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0) { /* Cache hit */ string_writer_clear(&w); @@ -973,12 +961,12 @@ int typecode_devicendarray(PyObject *dispatcher, PyObject *ary) extern "C" int typeof_typecode(PyObject *dispatcher, PyObject *val) { + PyTypeObject *tyobj = Py_TYPE(val); int subtype_attr; /* This needs to be kept in sync with Dispatcher.typeof_pyval(), * otherwise funny things may happen. 
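     * (Note the branches below use exact type checks; instances of
     * subclasses fall through to the subtype and fingerprint handling
     * further down.)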
*/ - - if (PyLong_Check(val)) { + if (tyobj == &PyInt_Type || tyobj == &PyLong_Type) { #if SIZEOF_VOID_P < 8 /* On 32-bit platforms, choose between tc_intp (32-bit) and tc_int64 */ PY_LONG_LONG ll = PyLong_AsLongLong(val); @@ -992,80 +980,79 @@ typeof_typecode(PyObject *dispatcher, PyObject *val) #endif return tc_intp; } - else if (PyFloat_Check(val)) + else if (tyobj == &PyFloat_Type) return tc_float64; - else if (PyComplex_Check(val)) + else if (tyobj == &PyComplex_Type) return tc_complex128; - /* Skip all problematic array checks for now to avoid segfault */ - // /* Array scalar handling */ - // else if (PyArray_CheckScalar(val)) { - // return typecode_arrayscalar(dispatcher, val); - // } - // /* Array handling */ - // else if (tyobj == &PyArray_Type) { - // return typecode_ndarray(dispatcher, (PyArrayObject*)val); - // } - // /* Subtype of CUDA device array */ - // else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { - // return typecode_devicendarray(dispatcher, val); - // } - // /* Subtypes of Array handling */ - // else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { - // /* By default, Numba will treat all numpy.ndarray subtypes as if they - // were the base numpy.ndarray type. In this way, ndarray subtypes - // can easily use all of the support that Numba has for ndarray - // methods. - // EXPERIMENTAL: There may be cases where a programmer would NOT want - // ndarray subtypes to be treated exactly like the base numpy.ndarray. - // For this purpose, a currently experimental feature allows a - // programmer to add an attribute named - // __numba_array_subtype_dispatch__ to their ndarray subtype. This - // attribute can have any value as Numba only checks for the presence - // of the attribute and not its value. When present, a ndarray subtype - // will NOT be typed by Numba as a regular ndarray but this code will - // fallthrough to the typecode_using_fingerprint call, which will - // create a new unique Numba typecode for this ndarray subtype. This - // behavior has several significant effects. First, since this - // ndarray subtype will be treated as a different type by Numba, - // the Numba dispatcher would then specialize on this type. So, if - // there was a function that had several parameters that were - // expected to be either numpy.ndarray or a subtype of ndarray, then - // Numba would compile a custom version of this function for each - // combination of base and subtypes that were actually passed to the - // function. Second, because this subtype would now be treated as - // a totally separate type, it will cease to function in Numba unless - // an implementation of that type is provided to Numba through the - // Numba type extension mechanisms (e.g., overload). This would - // typically start with defining a Numba type corresponding to the - // ndarray subtype. This is the same concept as how Numba has a - // corollary of numpy.ndarray in its type system as types.Array. - // Next, one would typically defining boxing and unboxing routines - // and the associated memory model. Then, overloads for NumPy - // functions on that type would be created. However, - // if the same default array memory model is used then there are tricks - // one can do to look at Numba's internal types.Array registries and - // to quickly apply those to the subtype as well. In this manner, - // only those cases where the base ndarray and the ndarray subtype - // behavior differ would new custom functions need to be written for - // the subtype. 
Finally, - // after adding support for the new type, you would have a separate - // ndarray subtype that could operate with other objects of the same - // subtype but would not support interoperation with regular NumPy - // ndarrays. In standard Python, this interoperation is provided - // through the __array_ufunc__ magic method in the ndarray subtype - // class and in that case the function operates on ndarrays or their - // subtypes. This idea is extended into Numba such that - // __array_ufunc__ can be present in a Numba array type object. - // In this case, this function is consulted during Numba typing and - // so the arguments to __array_ufunc__ are Numba types instead of - // ndarray subtypes. The array type __array_ufunc__ returns the - // type of the output of the given ufunc. - // */ - // subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__"); - // if (!subtype_attr) { - // return typecode_ndarray(dispatcher, (PyArrayObject*)val); - // } - // } + /* Array scalar handling */ + else if (PyArray_CheckScalar(val)) { + return typecode_arrayscalar(dispatcher, val); + } + /* Array handling */ + else if (tyobj == &PyArray_Type) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + /* Subtype of CUDA device array */ + else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { + return typecode_devicendarray(dispatcher, val); + } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* By default, Numba will treat all numpy.ndarray subtypes as if they + were the base numpy.ndarray type. In this way, ndarray subtypes + can easily use all of the support that Numba has for ndarray + methods. + EXPERIMENTAL: There may be cases where a programmer would NOT want + ndarray subtypes to be treated exactly like the base numpy.ndarray. + For this purpose, a currently experimental feature allows a + programmer to add an attribute named + __numba_array_subtype_dispatch__ to their ndarray subtype. This + attribute can have any value as Numba only checks for the presence + of the attribute and not its value. When present, a ndarray subtype + will NOT be typed by Numba as a regular ndarray but this code will + fallthrough to the typecode_using_fingerprint call, which will + create a new unique Numba typecode for this ndarray subtype. This + behavior has several significant effects. First, since this + ndarray subtype will be treated as a different type by Numba, + the Numba dispatcher would then specialize on this type. So, if + there was a function that had several parameters that were + expected to be either numpy.ndarray or a subtype of ndarray, then + Numba would compile a custom version of this function for each + combination of base and subtypes that were actually passed to the + function. Second, because this subtype would now be treated as + a totally separate type, it will cease to function in Numba unless + an implementation of that type is provided to Numba through the + Numba type extension mechanisms (e.g., overload). This would + typically start with defining a Numba type corresponding to the + ndarray subtype. This is the same concept as how Numba has a + corollary of numpy.ndarray in its type system as types.Array. + Next, one would typically defining boxing and unboxing routines + and the associated memory model. Then, overloads for NumPy + functions on that type would be created. 
However,
+       if the same default array memory model is used then there are tricks
+       one can do to look at Numba's internal types.Array registries and
+       to quickly apply those to the subtype as well. In this manner,
+       new custom functions need to be written only for those cases where
+       the base ndarray and the ndarray subtype behavior differ. Finally,
+       after adding support for the new type, you would have a separate
+       ndarray subtype that could operate with other objects of the same
+       subtype but would not support interoperation with regular NumPy
+       ndarrays. In standard Python, this interoperation is provided
+       through the __array_ufunc__ magic method in the ndarray subtype
+       class and in that case the function operates on ndarrays or their
+       subtypes. This idea is extended into Numba such that
+       __array_ufunc__ can be present in a Numba array type object.
+       In this case, this function is consulted during Numba typing and
+       so the arguments to __array_ufunc__ are Numba types instead of
+       ndarray subtypes. The array type __array_ufunc__ returns the
+       type of the output of the given ufunc.
+       */
+        subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__");
+        if (!subtype_attr) {
+            return typecode_ndarray(dispatcher, (PyArrayObject*)val);
+        }
+    }

     return typecode_using_fingerprint(dispatcher, val);
 }
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index d02beffb0..d26304008 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -2136,3 +2136,16 @@ def _reduce_states(self):
         Compiled definitions are discarded.
         """
         return dict(py_func=self.py_func, targetoptions=self.targetoptions)
+
+
+if config.USE_LEGACY_TYPE_SYSTEM:  # Old type system
+    # Initialize typeof machinery
+    _dispatcher.typeof_init(
+        OmittedArg, dict((str(t), t._code) for t in types.number_domain)
+    )
+else:  # New type system
+    # Initialize typeof machinery
+    _dispatcher.typeof_init(
+        OmittedArg,
+        dict((str(t).split("_")[-1], t._code) for t in types.np_number_domain),
+    )

From edeed473aa2fd3d8dfa8fa3392cb6cec51654f19 Mon Sep 17 00:00:00 2001
From: Vijay Kandiah
Date: Mon, 4 Aug 2025 23:19:13 -0700
Subject: [PATCH 04/62] [Refactor][NFC] Trim CUDA _dispatcher cext

---
 .../numba/cuda/_dispatcher/_dispatcher.cpp    | 268 +----------------
 .../numba/cuda/_dispatcher/_numba_common.h    |  43 ---
 numba_cuda/numba/cuda/_dispatcher/_typeof.cpp |   3 +-
 3 files changed, 5 insertions(+), 309 deletions(-)
 delete mode 100644 numba_cuda/numba/cuda/_dispatcher/_numba_common.h

diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp
index 5afdb2880..cbe4b6d7d 100644
--- a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp
+++ b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp
@@ -388,8 +388,8 @@ else \
 typedef std::vector<Type> TypeTable;
 typedef std::vector<PyObject*> Functions;
 
-/* The Dispatcher class is the base class of all dispatchers in the CPU and
-   CUDA targets. Its main responsibilities are:
+/* The Dispatcher class is the base class of all dispatchers in the CUDA target.
+   Its main responsibilities are:
 
    - Resolving the best overload to call for a given set of arguments, and
    - Calling the resolved overload.
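   (Resolution delegates to TypeManager::selectOverload: the typecodes of the
   actual arguments are rated against each compiled signature and the best
   Rating wins; no match or an ambiguous match falls back to compiling a new
   specialization or raising a TypeError.)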
@@ -662,131 +662,9 @@ int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) } -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11)) -/* A custom, fast, inlinable version of PyCFunction_Call() */ -static PyObject * -call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) -{ - PyCFunctionWithKeywords fn; - PyThreadState *tstate; - - assert(PyCFunction_Check(cfunc)); - assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); - fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); - tstate = PyThreadState_GET(); - -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) - /* - * On Python 3.11, _PyEval_EvalFrameDefault stops using PyTraceInfo since - * it's now baked into ThreadState. - * https://github.com/python/cpython/pull/26623 - */ - if (tstate->cframe->use_tracing && tstate->c_profilefunc) -#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10) - /* - * On Python 3.10+ trace_info comes from somewhere up in PyFrameEval et al, - * Numba doesn't have access to that so creates an equivalent struct and - * wires it up against the cframes. This is passed into the tracing - * functions. - * - * Code originally from: - * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L1611-L1622 - */ - PyTraceInfo trace_info; - trace_info.code = NULL; // not initialized - CFrame *prev_cframe = tstate->cframe; - trace_info.cframe.use_tracing = prev_cframe->use_tracing; - trace_info.cframe.previous = prev_cframe; - - if (trace_info.cframe.use_tracing && tstate->c_profilefunc) -#else - /* - * On Python prior to 3.10, tracing state is a member of the threadstate - */ - if (tstate->use_tracing && tstate->c_profilefunc) -#endif - { - /* - * The following code requires some explaining: - * - * We want the jit-compiled function to be visible to the profiler, so we - * need to synthesize a frame for it. - * The PyFrame_New() constructor doesn't do anything with the 'locals' value if the 'code's - * 'CO_NEWLOCALS' flag is set (which is always the case nowadays). - * So, to get local variables into the frame, we have to manually set the 'f_locals' - * member, then call `PyFrame_LocalsToFast`, where a subsequent call to the `frame.f_locals` - * property (by virtue of the `frame_getlocals` function in frameobject.c) will find them. 
- */ - PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); - PyObject *globals = PyDict_New(); - PyObject *builtins = PyEval_GetBuiltins(); - PyFrameObject *frame = NULL; - PyObject *result = NULL; -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 10)) - // Only used in 3.10, to help with saving/restoring exception state - PyObject *pyexc = NULL; - PyObject *err_type = NULL; - PyObject *err_value = NULL; - PyObject *err_traceback = NULL; -#endif - if (!code) { - PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); - goto error; - } - /* Populate builtins, which is required by some JITted functions */ - if (PyDict_SetItemString(globals, "__builtins__", builtins)) { - goto error; - } - - /* unset the CO_OPTIMIZED flag, make the frame get a new locals dict */ - code->co_flags &= 0xFFFE; - - frame = PyFrame_New(tstate, code, globals, locals); - if (frame == NULL) { - goto error; - } -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) - // Python 3.11 improved the frame infrastructure such that frames are - // updated by the virtual machine, no need to do PyFrame_LocalsToFast - // and PyFrame_FastToLocals to ensure `frame->f_locals` is consistent. - C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws), frame); -#else - // Populate the 'fast locals' in `frame` - PyFrame_LocalsToFast(frame, 0); - tstate->frame = frame; - - // make the call - C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws)); - - // write changes back to locals? - // PyFrame_FastToLocals can clear the exception indicator, therefore - // this state needs saving and restoring across the call if the - // exception indicator is set. - pyexc = PyErr_Occurred(); - if (pyexc != NULL) { - PyErr_Fetch(&err_type, &err_value, &err_traceback); - } - PyFrame_FastToLocals(frame); - if (pyexc != NULL) { - PyErr_Restore(err_type, err_value, err_traceback); - } - tstate->frame = frame->f_back; -#endif - error: - Py_XDECREF(frame); - Py_XDECREF(globals); - Py_XDECREF(code); - return result; - } - else - { - return fn(PyCFunction_GET_SELF(cfunc), args, kws); - } -} - -#elif (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) // Python 3.12 has a completely new approach to tracing and profiling due to // the new `sys.monitoring` system. @@ -1124,35 +1002,7 @@ call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyO #endif -static -PyObject* -compile_and_invoke(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) -{ - /* Compile a new one */ - PyObject *cfa, *cfunc, *retval; - cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); - if (cfa == NULL) - return NULL; - - /* NOTE: we call the compiled function ourselves instead of - letting the Python derived class do it. This is for proper - behaviour of globals() in jitted functions (issue #476). */ - cfunc = PyObject_Call(cfa, args, kws); - Py_DECREF(cfa); - - if (cfunc == NULL) - return NULL; - - if (PyObject_TypeCheck(cfunc, &PyCFunction_Type)) { - retval = call_cfunc(self, cfunc, args, kws, locals); - } else { - /* Re-enter interpreter */ - retval = PyObject_Call(cfunc, args, kws); - } - Py_DECREF(cfunc); - return retval; -} /* A copy of compile_and_invoke, that only compiles. 
This is needed for CUDA * kernels, because its overloads are Python instances of the _Kernel class, @@ -1288,117 +1138,7 @@ find_named_args(Dispatcher *self, PyObject **pargs, PyObject **pkws) return 0; } -static PyObject* -Dispatcher_call(Dispatcher *self, PyObject *args, PyObject *kws) -{ - PyObject *tmptype, *retval = NULL; - int *tys = NULL; - int argct; - int i; - int prealloc[24]; - int matches; - PyObject *cfunc; - PyThreadState *ts = PyThreadState_Get(); - PyObject *locals = NULL; - - /* If compilation is enabled, ensure that an exact match is found and if - * not compile one */ - int exact_match_required = self->can_compile ? 1 : self->exact_match_required; - -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) - if (ts->tracing && ts->c_profilefunc) { -#else - if (ts->use_tracing && ts->c_profilefunc) { -#endif - locals = PyEval_GetLocals(); - if (locals == NULL) { - goto CLEANUP; - } - } - if (self->fold_args) { - if (find_named_args(self, &args, &kws)) - return NULL; - } - else - Py_INCREF(args); - /* Now we own a reference to args */ - - argct = PySequence_Fast_GET_SIZE(args); - - if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) - tys = prealloc; - else - tys = new int[argct]; - - for (i = 0; i < argct; ++i) { - tmptype = PySequence_Fast_GET_ITEM(args, i); - tys[i] = typeof_typecode((PyObject *) self, tmptype); - if (tys[i] == -1) { - if (self->can_fallback){ - /* We will clear the exception if fallback is allowed. */ - PyErr_Clear(); - } else { - goto CLEANUP; - } - } - } - - /* We only allow unsafe conversions if compilation of new specializations - has been disabled. - - Note that the number of matches is returned in matches by resolve, which - accepts it as a reference. */ - cfunc = self->resolve(tys, matches, !self->can_compile, - exact_match_required); - - if (matches == 0 && !self->can_compile) { - /* - * If we can't compile a new specialization, look for - * matching signatures for which conversions haven't been - * registered on the C++ TypeManager. - */ - int res = search_new_conversions((PyObject *) self, args, kws); - if (res < 0) { - retval = NULL; - goto CLEANUP; - } - if (res > 0) { - /* Retry with the newly registered conversions */ - cfunc = self->resolve(tys, matches, !self->can_compile, - exact_match_required); - } - } - if (matches == 1) { - /* Definition is found */ - retval = call_cfunc(self, cfunc, args, kws, locals); - } else if (matches == 0) { - /* No matching definition */ - if (self->can_compile) { - retval = compile_and_invoke(self, args, kws, locals); - } else if (self->fallbackdef) { - /* Have object fallback */ - retval = call_cfunc(self, self->fallbackdef, args, kws, locals); - } else { - /* Raise TypeError */ - explain_matching_error((PyObject *) self, args, kws); - retval = NULL; - } - } else if (self->can_compile) { - /* Ambiguous, but are allowed to compile */ - retval = compile_and_invoke(self, args, kws, locals); - } else { - /* Ambiguous */ - explain_ambiguous((PyObject *) self, args, kws); - retval = NULL; - } - -CLEANUP: - if (tys != prealloc) - delete[] tys; - Py_DECREF(args); - return retval; -} /* Based on Dispatcher_call above, with the following differences: 1. It does not invoke the definition of the function. 
@@ -1565,7 +1305,7 @@ static PyTypeObject DispatcherType = { 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ - (PyCFunctionWithKeywords)Dispatcher_call, /* tp_call*/ + 0, /* tp_call*/ 0, /* tp_str*/ 0, /* tp_getattro*/ 0, /* tp_setattro*/ diff --git a/numba_cuda/numba/cuda/_dispatcher/_numba_common.h b/numba_cuda/numba/cuda/_dispatcher/_numba_common.h deleted file mode 100644 index d458e4240..000000000 --- a/numba_cuda/numba/cuda/_dispatcher/_numba_common.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef NUMBA_COMMON_H_ -#define NUMBA_COMMON_H_ - -/* __has_attribute() is a clang / gcc-5 macro */ -#ifndef __has_attribute -# define __has_attribute(x) 0 -#endif - -/* This attribute marks symbols that can be shared across C objects - * but are not exposed outside of a shared library or executable. - * Note this is default behaviour for global symbols under Windows. - */ -#if defined(_MSC_VER) - #define VISIBILITY_HIDDEN - #define VISIBILITY_GLOBAL __declspec(dllexport) -#elif (__has_attribute(visibility) || (defined(__GNUC__) && __GNUC__ >= 4)) - #define VISIBILITY_HIDDEN __attribute__ ((visibility("hidden"))) - #define VISIBILITY_GLOBAL __attribute__ ((visibility("default"))) -#else - #define VISIBILITY_HIDDEN - #define VISIBILITY_GLOBAL -#endif - -/* - * Numba's version of the PyArray_DescrCheck macro from NumPy, use it as a - * direct replacement of NumPy's PyArray_DescrCheck to ensure binary - * compatibility. - * - * Details of why this is needed: - * NumPy 1.18 changed the definition of the PyArray_DescrCheck macro here: - * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698 - * the result of this being that building against NumPy <1.18 would prevent - * Numba running against NumPy >= 1.20 as noted here: - * https://github.com/numba/numba/issues/6041#issuecomment-665132199 - * - * This macro definition is copied from: - * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698#diff-ad2213da23136c5fc5883d9eb2d88666R26 - * - * NOTE: This is the NumPy 1.18 and above version of the macro. 
- */ -#define NUMBA_PyArray_DescrCheck(op) PyObject_TypeCheck(op, &PyArrayDescr_Type) - -#endif /* NUMBA_COMMON_H_ */ diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp index 16c5009c4..4f03d0070 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp +++ b/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp @@ -4,7 +4,6 @@ #include #include -#include "_numba_common.h" #include "_typeof.h" #include "_hashtable.h" #include "_devicearray.h" @@ -460,7 +459,7 @@ compute_fingerprint(string_writer_t *w, PyObject *val) PyBuffer_Release(&buf); return 0; } - if (NUMBA_PyArray_DescrCheck(val)) { + if (PyObject_TypeCheck(val, &PyArrayDescr_Type)) { TRY(string_writer_put_char, w, OP_NP_DTYPE); return compute_dtype_fingerprint(w, (PyArray_Descr *) val); } From df14badf16804c01c954f92a7ba127db41abd3f4 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 5 Aug 2025 10:35:29 -0700 Subject: [PATCH 05/62] [Refactor][NFC] Remove sysmon from _dispatcher cext --- numba_cuda/numba/cuda/__init__.py | 1 - .../numba/cuda/_dispatcher/_dispatcher.cpp | 337 +----------------- numba_cuda/numba/cuda/dispatcher.py | 1 - pyproject.toml | 1 - 4 files changed, 3 insertions(+), 337 deletions(-) diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 607ef2e09..0465e1903 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -1,6 +1,5 @@ import importlib from numba.core import config -from numba.core.config import ENABLE_SYS_MONITORING from .utils import _readenv import warnings diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp index cbe4b6d7d..46cb7888a 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp @@ -403,8 +403,6 @@ class Dispatcher { PyObject_HEAD /* Whether compilation of new overloads is permitted */ char can_compile; - /* Enable sys.monitoring (since Python 3.12+) */ - char enable_sysmon; /* Whether fallback to object mode is permitted */ char can_fallback; /* Whether types must match exactly when resolving overloads. @@ -534,7 +532,6 @@ Dispatcher_init(Dispatcher *self, PyObject *args, PyObject *kwds) self->tm = static_cast(tmaddr); self->argct = argct; self->can_compile = 1; - self->enable_sysmon = 0; // default to turn off sys.monitoring self->can_fallback = can_fallback; self->fallbackdef = NULL; self->has_stararg = has_stararg; @@ -662,344 +659,22 @@ int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) } - - -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) - -// Python 3.12 has a completely new approach to tracing and profiling due to -// the new `sys.monitoring` system. - -// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L863-L868 - -static const int8_t MOST_SIG_BIT[16] = {-1, 0, 1, 1, - 2, 2, 2, 2, - 3, 3, 3, 3, - 3, 3, 3, 3}; - -// From: https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L873-L879 - -static inline int msb(uint8_t bits) { - if (bits > 15) { - return MOST_SIG_BIT[bits>>4]+4; - } - return MOST_SIG_BIT[bits]; -} - - -static int invoke_monitoring(PyThreadState * tstate, int event, Dispatcher *self, PyObject* retval) -{ - // This will invoke monitoring tools (if present) for the event `event`. 
- // - // Arguments: - // tstate - the interpreter thread state - // event - an event as defined in internal/pycore_instruments.h - // self - the dispatcher - // retval - the return value from running the dispatcher machine code (if needed) - // or NULL if not needed. - // - // Return: - // status 0 for success -1 otherwise. - // - // Notes: - // Python 3.12 has a new monitoring system as described in PEP 669. It's - // largely implemented in CPython PR #103083. - // - // This PEP manifests as a set of monitoring instrumentation in the form of - // per-monitoring-tool-type callbacks stored as part of the interpreter - // state (can also be on the code object for "local events" but Numba - // doesn't support those, see the Numba developer docs). From the Python - // interpreter this appears as `sys.monitoring`, from the C-side there's not - // a great deal of public API for the sort of things that Numba wants/needs - // to do. - // - // The new monitoring system is event based, the general idea in the - // following code is to see if a monitoring "tool" has registered a callback - // to run on the presence of a particular event and run those callbacks if - // so. In Numba's case we're just about to disappear into machine code - // that's essentially doing the same thing as the interpreter would if it - // executed the bytecode present in the function that's been JIT compiled. - // As a result we need to tell any tool that has a callback registered for a - // PY_MONITORING_EVENT_PY_START that a Python function is about to start - // (and do something similar for when a function returns/raises). - // This is a total lie as the execution is in machine code, but telling this - // lie makes it look like a python function has started executing at the - // point the machine code function starts and tools like profilers will be - // able to identify this and do something appropriate. The "lie" is very - // much like lie told for Python < 3.12, but the format of the lie is - // different. There is no fake frame involved, it's just about calling an - // appropriate call back, which in a way is a lot less confusing to deal - // with. - // - // For reference, under cProfile all these are NULL, don't even look at - // them, they are legacy, you need to use the monitoring system! - // tstate->c_profilefunc - // tstate->c_profileobj - // tstate->c_tracefunc - // tstate->c_traceobj - // - // Finally: Useful places to look in the CPython code base: - // 1. internal/pycore_instruments.h which has the #defines for all the event - // types and the "types" of tools e.g. debugger, profiler. - // 2. Python/instrumentation.c which is where most of the implementation is - // done. Particularly functions `call_instrumentation_vector` and - // `call_one_instrument`. - // Note that Python/legacy_tracing.c is not somewhere to look, it's just - // wiring old style tracing that has been setup via e.g. C-API - // PyEval_SetProfile into the new monitoring system. - // - // Other things... - // 1. Calls to `sys.monitoring.set_events` clobber the previous state. - // 2. You can register callbacks for an event without having the event set. - // 3. You can set events and have no associated callback. - // 4. Tools are supposed to be respectful of other tools that are - // registered, i.e. not clobber/interfere with each other. - // 5. There are multiple slots for tools, cProfile is a profiler and - // profilers should register in slot 2 by convention. 
- // - // This is useful for debug: - // To detect whether Python is doing _any_ monitoring it's necessary to - // inspect the per-thread state interpreter monitors.tools member, its a - // uchar[15]. A non-zero value in any tools slot suggests something - // is registered to be called on the occurence of some event. - // - // bool monitoring_tools_present = false; - // for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { - // if (tstate->interp->monitors.tools[i]) { - // monitoring_tools_present = true; - // break; - // } - // } - - // The code in this function is based loosely on a combination of the - // following: - // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L945-L1008 - // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L1010-L1026 - // https://github.com/python/cpython/blob/0ab2384c5f56625e99bb35417cadddfe24d347e1/Python/instrumentation.c#L839-L861 - - // TODO: check this, call_instrumentation_vector has this at the top. - if (tstate->tracing){ - return 0; - } - - // Are there any tools set on this thead for this event? - uint8_t tools = tstate->interp->monitors.tools[event]; - // offset value for use in callbacks - PyObject * offset_obj = NULL; - // callback args slots (used in vectorcall protocol) - PyObject * callback_args[3] = {NULL, NULL, NULL}; - - // If so... - if (tools) - { - - - PyObject *result = NULL; - PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); // incref code - if (!code) { - PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); - return -1; - } - - // TODO: handle local events, see `get_tools_for_instruction`. - // The issue with local events is that they maybe don't make a lot of - // sense in a JIT context. The way it works is that - // `sys.monitoring.set_local_events` takes the code object of a function - // and "instruments" it with respect to the requested events. In - // practice this seems to materialise as swapping bytecodes associated - // with the event bitmask for `INSTRUMENTED_` variants of those - // bytecodes. Then at interpretation time if an instrumented instruction - // is encountered it triggers lookups in the `code->_co_monitoring` - // struct for tools and active monitors etc. In Numba we _know_ the - // bytecode at which the code starts and we can probably scrape the code - // to look for instrumented return instructions, so it is feasible to - // support at least PY_START and PY_RETURN events, however, it's a lot - // of effort for perhaps something that's practically not that useful. - // As a result, only global events are supported at present. - - // This is supposed to be the offset of the - // currently-being-interpreted bytecode instruction. In Numba's case - // there is no bytecode executing. We know that for a PY_START event - // that the offset is probably zero (it might be 2 if there's a - // closure, it's whereever the `RESUME` bytecode appears). However, - // we don't know which bytecode will be associated with the return - // (without huge effort to wire that through to here). Therefore - // zero is also used for return/raise/unwind, the main use case, - // cProfile, seems to manage to do something sensible even though this - // is inaccurate. - offset_obj = PyLong_FromSsize_t(0); // incref offset_obj - - // This is adapted from call_one_instrument. 
Note that Numba has to care - // about all events even though it only emits fake events for PY_START, - // PY_RETURN, RAISE and PY_UNWIND, this is because of the ability of - // `objmode` to call back into the interpreter and essentially create a - // continued Python execution environment/stack from there. - while(tools) { - // The tools registered are set as bits in `tools` and provide an - // index into monitoring_callables. This is presumably used by - // cPython to detect if the slot of a tool type is already in use so - // that a user can't register more than one tool of a given type at - // the same time. - int tool = msb(tools); - tools ^= (1 << tool); - // Get the instrument at offset `tool` for the event of interest, - // this is a callback function, it also might not be present! It - // is entirely legitimate to have events that have no callback - // and callbacks that have no event. This is to make it relatively - // easy to switch events on and off and ensure that monitoring is - // "lightweight". - PyObject * instrument = (PyObject *)tstate->interp->monitoring_callables[tool][event]; - if (instrument == NULL){ - continue; - } - - // Swap the threadstate "event" for the event of interest and - // increment the tracing tracking field (essentially, inlined - // PyThreadState_EnterTracing). - int old_what = tstate->what_event; - tstate->what_event = event; - tstate->tracing++; - - // Need to call the callback instrument. Need to know the number of - // arguments, this is based on whether the `retval` (return value) - // is NULL (it indicates whether this is a PY_START, or something - // like a PY_RETURN, which has 3 arguments). - size_t nargsf = (retval == NULL ? 2 : 3) | PY_VECTORCALL_ARGUMENTS_OFFSET; - - // call the instrumentation, look at the args to the callback - // functions for sys.monitoring events to find out what the - // arguments are. e.g. - // PY_START has `func(code: CodeType, instruction_offset: int)` - // whereas - // PY_RETURN has `func(code: CodeType, instruction_offset: int, retval: object)` - // and - // CALL, C_RAISE, C_RETURN has `func(code: CodeType, instruction_offset: int, callable: object, arg0 object|MISSING)` - // i.e. the signature changes based on context. This influences the - // value of `nargsf` and what is wired into `callback_args`. First two - // arguments are always code and offset, optional third arg is - // the return value. - callback_args[0] = (PyObject*)code; - callback_args[1] = (PyObject*)offset_obj; - callback_args[2] = (PyObject*)retval; - PyObject ** callargs = &callback_args[0]; - - // finally, stage the call the the instrument - result = PyObject_Vectorcall(instrument, callargs, nargsf, NULL); - - // decrement the tracing tracking field and set the event back to - // the original event (essentially, inlined - // PyThreadState_LeaveTracing). - tstate->tracing--; - tstate->what_event = old_what; - - if (result == NULL){ - // Error occurred in call to instrumentation. 
- Py_XDECREF(offset_obj); - Py_XDECREF(code); - return -1; - } - } - Py_XDECREF(offset_obj); - Py_XDECREF(code); - } - return 0; -} - -/* invoke monitoring for PY_START if it is set */ -int static inline invoke_monitoring_PY_START(PyThreadState * tstate, Dispatcher *self) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_START, self, NULL); -} - -/* invoke monitoring for PY_RETURN if it is set */ -int static inline invoke_monitoring_PY_RETURN(PyThreadState * tstate, Dispatcher *self, PyObject * retval) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_RETURN, self, retval); -} - -/* invoke monitoring for RAISE if it is set */ -int static inline invoke_monitoring_RAISE(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_RAISE, self, exception); -} - -/* invoke monitoring for PY_UNWIND if it is set */ -int static inline invoke_monitoring_PY_UNWIND(PyThreadState * tstate, Dispatcher *self, PyObject * exception) { - return invoke_monitoring(tstate, PY_MONITORING_EVENT_PY_UNWIND, self, exception); -} - -/* forward declaration */ -bool static is_sysmon_enabled(Dispatcher *self); - /* A custom, fast, inlinable version of PyCFunction_Call() */ static PyObject * call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) { PyCFunctionWithKeywords fn = NULL; - PyThreadState *tstate = NULL; PyObject * pyresult = NULL; - PyObject * pyexception = NULL; - const bool enabled_sysmon = is_sysmon_enabled(self); assert(PyCFunction_Check(cfunc)); assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); - tstate = PyThreadState_GET(); - // issue PY_START if event is set - if(enabled_sysmon && invoke_monitoring_PY_START(tstate, self) != 0){ - return NULL; - } + // make call pyresult = fn(PyCFunction_GET_SELF(cfunc), args, kws); - if (enabled_sysmon && pyresult == NULL) { - // pyresult == NULL, which means the Numba function raised an exception - // which is now pending. - // - // NOTE: that _ALL_ exceptions trigger the RAISE event, even a - // StopIteration exception. To get a STOP_ITERATION event, the - // StopIteration exception must be "implied" i.e. a for loop exhausting - // a generator, whereas those coming from the executing the binary - // wrapped in this dispatcher must always be explicit (this is after all - // a function dispatcher). - // - // NOTE: That it is necessary to trigger both a `RAISE` event, as this - // triggered by an exception being raised, and a `PY_UNWIND` event, as - // this is the event for "exiting from a python function during - // exception unwinding" (see CPython sys.monitoring docs). - // - // In the following, if the call to PyErr_GetRaisedException returns - // NULL, it means that something has cleared the error indicator and - // this is a most surprising state to occur (shouldn't be possible!). - // - // TODO: This makes the exception raising path a little slower as the - // exception state is suspended and resumed regardless of whether - // monitoring for such an event is set. In future it might be worth - // checking the tstate->interp->monitors.tools[event] and only doing the - // suspend/resume if something is listening for the event. - pyexception = PyErr_GetRaisedException(); - if (pyexception != NULL) { - if(invoke_monitoring_RAISE(tstate, self, pyexception) != 0){ - // If the monitoring callback raised, return NULL so that the - // exception can propagate. 
- return NULL; - } - if(invoke_monitoring_PY_UNWIND(tstate, self, pyexception) != 0){ - // If the monitoring callback raised, return NULL so that the - // exception can propagate. - return NULL; - } - // reset the exception - PyErr_SetRaisedException(pyexception); - } - // Exception in Numba call as pyresult == NULL, start to unwind by - // returning NULL. - return NULL; - } - // issue PY_RETURN if event is set - if(enabled_sysmon && invoke_monitoring_PY_RETURN(tstate, self, pyresult) != 0){ - return NULL; - } + return pyresult; } -#else -#error "Python version is not supported." -#endif @@ -1285,7 +960,6 @@ static PyMethodDef Dispatcher_methods[] = { static PyMemberDef Dispatcher_members[] = { {(char*)"_can_compile", T_BOOL, offsetof(Dispatcher, can_compile), 0, NULL }, - {(char*)"_enable_sysmon", T_BOOL, offsetof(Dispatcher, enable_sysmon), 0, NULL }, {NULL} /* Sentinel */ }; @@ -1360,12 +1034,7 @@ static PyTypeObject DispatcherType = { }; -#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) -static -bool is_sysmon_enabled(Dispatcher * self) { - return self->enable_sysmon; -} -#endif + static PyObject *compute_fingerprint(PyObject *self, PyObject *args) { diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index d26304008..4afcbbde7 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -820,7 +820,6 @@ def __init__( self.doc = py_func.__doc__ self._compiling_counter = CompilingCounter() - self._enable_sysmon = bool(config.ENABLE_SYS_MONITORING) weakref.finalize(self, self._make_finalizer()) def _compilation_chain_init_hook(self): diff --git a/pyproject.toml b/pyproject.toml index e0c2da629..3fd0f65fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,6 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", - "numpy", ] [project] From 0f7c9a7d2f263f44bd1c9ac95611e26872843f65 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 6 Aug 2025 09:31:31 -0700 Subject: [PATCH 06/62] [Refactor][NFC] Changes to build _dispatcher cext with pip install -e . 
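
Editable installs ("pip install -e .") drive setuptools through the
editable_wheel command (PEP 660), which the extension-build check in
setup.py did not previously treat as a build command, so the cext was
skipped. A minimal sketch of the kind of check involved (illustrative
only; the helper name and command list here are assumptions, not the
exact setup.py contents):

    import sys

    def is_building_sketch(argv=None):
        # Sketch only: build native extensions when a build-like command is
        # requested; "editable_wheel" is what "pip install -e ." invokes
        # under PEP 660, so it must be recognised alongside the others.
        argv = sys.argv[1:] if argv is None else argv
        build_commands = ["build", "build_ext", "bdist_wheel",
                          "editable_wheel", "develop", "install"]
        return any(cmd in argv for cmd in build_commands)
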
--- numba_cuda/numba/cuda/dispatcher.py | 15 ++++----------- pyproject.toml | 1 + setup.py | 1 + 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 4afcbbde7..a88093e31 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -2137,14 +2137,7 @@ def _reduce_states(self): return dict(py_func=self.py_func, targetoptions=self.targetoptions) -if config.USE_LEGACY_TYPE_SYSTEM: # Old type system - # Initialize typeof machinery - _dispatcher.typeof_init( - OmittedArg, dict((str(t), t._code) for t in types.number_domain) - ) -else: # New type system - # Initialize typeof machinery - _dispatcher.typeof_init( - OmittedArg, - dict((str(t).split("_")[-1], t._code) for t in types.np_number_domain), - ) +# Initialize typeof machinery +_dispatcher.typeof_init( + OmittedArg, dict((str(t), t._code) for t in types.number_domain) +) diff --git a/pyproject.toml b/pyproject.toml index 3fd0f65fc..e0c2da629 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", + "numpy", ] [project] diff --git a/setup.py b/setup.py index 31f66ae0e..c4a4b671b 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,7 @@ def is_building(): "develop", "easy_install", "test", + "editable_wheel", ] return any(bc in sys.argv[1:] for bc in build_commands) From 21fd5068fe18ed08ad9ff0fdedf1c9aa2e95c667 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 6 Aug 2025 10:45:21 -0700 Subject: [PATCH 07/62] Add numpy build dep to conda/recipes/numba-cuda --- conda/recipes/numba-cuda/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 5aecbfcb7..cba6deb08 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,6 +25,7 @@ requirements: - python - pip - setuptools + - numpy run: - python - numba >=0.59.1 From 3bcdaad669d86f4b9b31255de1ae8002070edc51 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Fri, 8 Aug 2025 11:41:02 -0700 Subject: [PATCH 08/62] [Refactor][NFC] Add _devicearray, mviewbuf cext, move all cexts to top level numba_cuda --- numba_cuda/cext/_devicearray.cpp | 156 +++++++ .../cuda/_dispatcher => cext}/_devicearray.h | 0 .../cuda/_dispatcher => cext}/_dispatcher.cpp | 7 +- .../cuda/_dispatcher => cext}/_hashtable.cpp | 0 .../cuda/_dispatcher => cext}/_hashtable.h | 0 .../cuda/_dispatcher => cext}/_pymodule.h | 0 .../cuda/_dispatcher => cext}/_typeof.cpp | 0 .../cuda/_dispatcher => cext}/_typeof.h | 0 numba_cuda/cext/mviewbuf.c | 382 ++++++++++++++++++ .../cuda/_dispatcher => cext}/typeconv.cpp | 0 .../cuda/_dispatcher => cext}/typeconv.hpp | 0 numba_cuda/numba/cuda/cudadrv/devicearray.py | 2 +- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/dispatcher.py | 2 +- setup.py | 43 +- 15 files changed, 577 insertions(+), 17 deletions(-) create mode 100644 numba_cuda/cext/_devicearray.cpp rename numba_cuda/{numba/cuda/_dispatcher => cext}/_devicearray.h (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_dispatcher.cpp (99%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_hashtable.cpp (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_hashtable.h (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_pymodule.h (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_typeof.cpp (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/_typeof.h (100%) create 
mode 100644 numba_cuda/cext/mviewbuf.c rename numba_cuda/{numba/cuda/_dispatcher => cext}/typeconv.cpp (100%) rename numba_cuda/{numba/cuda/_dispatcher => cext}/typeconv.hpp (100%) diff --git a/numba_cuda/cext/_devicearray.cpp b/numba_cuda/cext/_devicearray.cpp new file mode 100644 index 000000000..a822c6261 --- /dev/null +++ b/numba_cuda/cext/_devicearray.cpp @@ -0,0 +1,156 @@ +/* This file contains the base class implementation for all device arrays. The + * base class is implemented in C so that computing typecodes for device arrays + * can be implemented efficiently. */ + +#include "_pymodule.h" + + +/* Include _devicearray., but make sure we don't get the definitions intended + * for consumers of the Device Array API. + */ +#define NUMBA_IN_DEVICEARRAY_CPP_ +#include "_devicearray.h" + +/* DeviceArray PyObject implementation. Note that adding more members here is + * presently prohibited because mapped and managed arrays derive from both + * DeviceArray and NumPy's ndarray, which is also a C extension class - the + * layout of the object cannot be resolved if this class also has members beyond + * PyObject_HEAD. */ +class DeviceArray { + PyObject_HEAD +}; + +/* Trivial traversal - DeviceArray instances own nothing. */ +static int +DeviceArray_traverse(DeviceArray *self, visitproc visit, void *arg) +{ + return 0; +} + +/* Trivial clear of all references - DeviceArray instances own nothing. */ +static int +DeviceArray_clear(DeviceArray *self) +{ + return 0; +} + +/* The _devicearray.DeviceArray type */ +PyTypeObject DeviceArrayType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_devicearray.DeviceArray", /* tp_name */ + sizeof(DeviceArray), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, + /* tp_flags*/ + "DeviceArray object", /* tp_doc */ + (traverseproc) DeviceArray_traverse, /* tp_traverse */ + (inquiry) DeviceArray_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." 
+#endif +/* END WARNING*/ +}; + +/* CUDA device array C API */ +static void *_DeviceArray_API[1] = { + (void*)&DeviceArrayType +}; + +MOD_INIT(_devicearray) { + PyObject *m = nullptr; + PyObject *d = nullptr; + PyObject *c_api = nullptr; + int error = 0; + + MOD_DEF(m, "_devicearray", "No docs", NULL) + if (m == NULL) + goto error_occurred; + + c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda.cext._devicearray._DEVICEARRAY_API", NULL); + if (c_api == NULL) + goto error_occurred; + + DeviceArrayType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DeviceArrayType) < 0) + goto error_occurred; + + Py_INCREF(&DeviceArrayType); + error = PyModule_AddObject(m, "DeviceArray", (PyObject*)(&DeviceArrayType)); + if (error) + goto error_occurred; + + d = PyModule_GetDict(m); + if (d == NULL) + goto error_occurred; + + error = PyDict_SetItemString(d, "_DEVICEARRAY_API", c_api); + /* Decref and set c_api to NULL, Py_XDECREF in error_occurred will have no + * effect. */ + Py_CLEAR(c_api); + + if (error) + goto error_occurred; + + return MOD_SUCCESS_VAL(m); + +error_occurred: + Py_XDECREF(m); + Py_XDECREF(c_api); + Py_XDECREF((PyObject*)&DeviceArrayType); + + return MOD_ERROR_VAL; +} diff --git a/numba_cuda/numba/cuda/_dispatcher/_devicearray.h b/numba_cuda/cext/_devicearray.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_devicearray.h rename to numba_cuda/cext/_devicearray.h diff --git a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp b/numba_cuda/cext/_dispatcher.cpp similarity index 99% rename from numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp rename to numba_cuda/cext/_dispatcher.cpp index 46cb7888a..b2f3e1c54 100644 --- a/numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp +++ b/numba_cuda/cext/_dispatcher.cpp @@ -935,13 +935,14 @@ Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) static int import_devicearray(void) { - PyObject *devicearray = PyImport_ImportModule("numba._devicearray"); + // Import our vendored devicearray (following original numba's pattern) + PyObject *devicearray = PyImport_ImportModule("numba_cuda.cext._devicearray"); if (devicearray == NULL) { return -1; } Py_DECREF(devicearray); - DeviceArray_API = (void**)PyCapsule_Import("numba._devicearray._DEVICEARRAY_API", 0); + DeviceArray_API = (void**)PyCapsule_Import("numba_cuda.cext._devicearray._DEVICEARRAY_API", 0); if (DeviceArray_API == NULL) { return -1; } @@ -1056,7 +1057,7 @@ static PyMethodDef ext_methods[] = { MOD_INIT(_dispatcher) { if (import_devicearray() < 0) { PyErr_Print(); - PyErr_SetString(PyExc_ImportError, "numba._devicearray failed to import"); + PyErr_SetString(PyExc_ImportError, "numba_cuda.cext._devicearray failed to import"); return MOD_ERROR_VAL; } diff --git a/numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp b/numba_cuda/cext/_hashtable.cpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp rename to numba_cuda/cext/_hashtable.cpp diff --git a/numba_cuda/numba/cuda/_dispatcher/_hashtable.h b/numba_cuda/cext/_hashtable.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_hashtable.h rename to numba_cuda/cext/_hashtable.h diff --git a/numba_cuda/numba/cuda/_dispatcher/_pymodule.h b/numba_cuda/cext/_pymodule.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_pymodule.h rename to numba_cuda/cext/_pymodule.h diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.cpp b/numba_cuda/cext/_typeof.cpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_typeof.cpp rename to 
numba_cuda/cext/_typeof.cpp diff --git a/numba_cuda/numba/cuda/_dispatcher/_typeof.h b/numba_cuda/cext/_typeof.h similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/_typeof.h rename to numba_cuda/cext/_typeof.h diff --git a/numba_cuda/cext/mviewbuf.c b/numba_cuda/cext/mviewbuf.c new file mode 100644 index 000000000..33f0888c7 --- /dev/null +++ b/numba_cuda/cext/mviewbuf.c @@ -0,0 +1,382 @@ +#include "_pymodule.h" + +static int get_writable_buffer(PyObject* obj, Py_buffer *buf, int force) +{ + Py_buffer read_buf; + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + int ret; + + /* Attempt to get a writable buffer */ + if (!PyObject_GetBuffer(obj, buf, flags|PyBUF_WRITABLE)) + return 0; + if (!force) + return -1; + + /* Make a writable buffer from a read-only buffer */ + PyErr_Clear(); + if(-1 == PyObject_GetBuffer(obj, &read_buf, flags)) + return -1; + ret = PyBuffer_FillInfo(buf, NULL, read_buf.buf, read_buf.len, 0, + flags|PyBUF_WRITABLE); + PyBuffer_Release(&read_buf); + return ret; +} + +static int get_readonly_buffer(PyObject* obj, Py_buffer *buf) +{ + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + + return PyObject_GetBuffer(obj, buf, flags); +} + + +static void free_buffer(Py_buffer * buf) +{ + PyBuffer_Release(buf); +} + +/** + * Return a pointer to the data of a writable buffer from obj. If only a + * read-only buffer is available and force is True, a read-write buffer based on + * the read-only buffer is obtained. Note that this may have some surprising + * effects on buffers which expect the data from their read-only buffer not to + * be modified. + */ +static PyObject* +memoryview_get_buffer(PyObject *self, PyObject *args){ + PyObject *obj = NULL; + int force = 0; + int readonly = 0; + PyObject *ret = NULL; + Py_buffer buf; + + if (!PyArg_ParseTuple(args, "O|ii", &obj, &force, &readonly)) + return NULL; + + if (readonly) { + if (get_readonly_buffer(obj, &buf)) + return NULL; + } else { + if (get_writable_buffer(obj, &buf, force)) + return NULL; + } + + ret = PyLong_FromVoidPtr(buf.buf); + free_buffer(&buf); + return ret; +} + +/** + * Gets a half-open range [start, end) which contains the array data + * Modified from numpy/core/src/multiarray/array_assign.c + */ +static PyObject* +get_extents(Py_ssize_t *shape, Py_ssize_t *strides, int ndim, + Py_ssize_t itemsize, Py_ssize_t ptr) +{ + Py_ssize_t start, end; + int idim; + Py_ssize_t *dimensions = shape; + PyObject *ret = NULL; + + if (ndim < 0 ){ + PyErr_SetString(PyExc_ValueError, "buffer ndim < 0"); + return NULL; + } + + if (!dimensions) { + if (ndim == 0) { + start = end = ptr; + end += itemsize; + return Py_BuildValue("nn", start, end); + } + PyErr_SetString(PyExc_ValueError, "buffer shape is not defined"); + return NULL; + } + + if (!strides) { + PyErr_SetString(PyExc_ValueError, "buffer strides is not defined"); + return NULL; + } + + /* Calculate with a closed range [start, end] */ + start = end = ptr; + for (idim = 0; idim < ndim; ++idim) { + Py_ssize_t stride = strides[idim], dim = dimensions[idim]; + /* If the array size is zero, return an empty range */ + if (dim == 0) { + start = end = ptr; + ret = Py_BuildValue("nn", start, end); + break; + } + /* Expand either upwards or downwards depending on stride */ + else { + if (stride > 0) { + end += stride * (dim - 1); + } + else if (stride < 0) { + start += stride * (dim - 1); + } + } + } + + if (!ret) { + /* Return a half-open range */ + Py_ssize_t out_start = start; + Py_ssize_t out_end = end + itemsize; + + ret = Py_BuildValue("nn", out_start, 
out_end); + } + + return ret; +} + +static PyObject* +memoryview_get_extents(PyObject *self, PyObject *args) +{ + PyObject *obj = NULL; + PyObject *ret = NULL; + Py_buffer b; + if (!PyArg_ParseTuple(args, "O", &obj)) + return NULL; + + if (get_readonly_buffer(obj, &b)) + return NULL; + + ret = get_extents(b.shape, b.strides, b.ndim, b.itemsize, + (Py_ssize_t)b.buf); + free_buffer(&b); + return ret; +} + +static PyObject* +memoryview_get_extents_info(PyObject *self, PyObject *args) +{ + int i; + Py_ssize_t *shape_ary = NULL; + Py_ssize_t *strides_ary = NULL; + PyObject *shape_tuple = NULL; + PyObject *strides_tuple = NULL; + PyObject *shape = NULL, *strides = NULL; + Py_ssize_t itemsize = 0; + int ndim = 0; + PyObject* res = NULL; + + if (!PyArg_ParseTuple(args, "OOin", &shape, &strides, &ndim, &itemsize)) + goto cleanup; + + if (ndim < 0) { + PyErr_SetString(PyExc_ValueError, "ndim is negative"); + goto cleanup; + } + + if (itemsize <= 0) { + PyErr_SetString(PyExc_ValueError, "ndim <= 0"); + goto cleanup; + } + + shape_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + strides_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + + shape_tuple = PySequence_Fast(shape, "shape is not a sequence"); + if (!shape_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + shape_ary[i] = PyNumber_AsSsize_t( + PySequence_Fast_GET_ITEM(shape_tuple, i), + PyExc_OverflowError); + } + + strides_tuple = PySequence_Fast(strides, "strides is not a sequence"); + if (!strides_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + strides_ary[i] = PyNumber_AsSsize_t( + PySequence_Fast_GET_ITEM(strides_tuple, i), + PyExc_OverflowError); + } + + res = get_extents(shape_ary, strides_ary, ndim, itemsize, 0); +cleanup: + free(shape_ary); + free(strides_ary); + Py_XDECREF(shape_tuple); + Py_XDECREF(strides_tuple); + return res; +} + + +/* new type to expose buffer interface */ +typedef struct { + PyObject_HEAD + /* Type-specific fields go here. 
*/ +} MemAllocObject; + + +static int +get_bufinfo(PyObject *self, Py_ssize_t *psize, void **pptr) +{ + PyObject *buflen = NULL; + PyObject *bufptr = NULL; + Py_ssize_t size = 0; + void* ptr = NULL; + int ret = -1; + + buflen = PyObject_GetAttrString(self, "_buflen_"); + if (!buflen) goto cleanup; + + bufptr = PyObject_GetAttrString(self, "_bufptr_"); + if (!bufptr) goto cleanup; + + size = PyNumber_AsSsize_t(buflen, PyExc_OverflowError); + if (size == -1 && PyErr_Occurred()) goto cleanup; + else if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative buffer size"); + goto cleanup; + } + + ptr = PyLong_AsVoidPtr(PyNumber_Long(bufptr)); + if (PyErr_Occurred()) + goto cleanup; + else if (!ptr) { + PyErr_SetString(PyExc_ValueError, "null buffer pointer"); + goto cleanup; + } + + *psize = size; + *pptr = ptr; + ret = 0; +cleanup: + Py_XDECREF(buflen); + Py_XDECREF(bufptr); + return ret; +} + + +static int +MemAllocObject_getbuffer(PyObject *self, Py_buffer *view, int flags) +{ + Py_ssize_t size = 0; + void *ptr = 0; + int readonly; + + if(-1 == get_bufinfo(self, &size, &ptr)) + return -1; + + readonly = (PyBUF_WRITABLE & flags) != PyBUF_WRITABLE; + + /* fill buffer */ + if (-1 == PyBuffer_FillInfo(view, self, (void*)ptr, size, readonly, flags)) + return -1; + + return 0; +} + +static void +MemAllocObject_releasebuffer(PyObject *self, Py_buffer *view) +{ + /* Do nothing */ +} + +static PyBufferProcs MemAlloc_as_buffer = { + MemAllocObject_getbuffer, + MemAllocObject_releasebuffer, +}; + + +static PyTypeObject MemAllocType = { + PyVarObject_HEAD_INIT(NULL, 0) + "mviewbuf.MemAlloc", /* tp_name */ + sizeof(MemAllocObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + &MemAlloc_as_buffer, /* tp_as_buffer */ + (Py_TPFLAGS_DEFAULT| Py_TPFLAGS_BASETYPE), /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." 
+#endif +/* END WARNING*/ +}; + + +static PyMethodDef core_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(memoryview_get_buffer), + declmethod(memoryview_get_extents), + declmethod(memoryview_get_extents_info), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(mviewbuf) { + PyObject *module; + MOD_DEF(module, "mviewbuf", "No docs", core_methods) + if (module == NULL) + return MOD_ERROR_VAL; + + MemAllocType.tp_new = PyType_GenericNew; + if (PyType_Ready(&MemAllocType) < 0){ + return MOD_ERROR_VAL; + } + + Py_INCREF(&MemAllocType); + PyModule_AddObject(module, "MemAlloc", (PyObject*)&MemAllocType); + + return MOD_SUCCESS_VAL(module); +} diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.cpp b/numba_cuda/cext/typeconv.cpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/typeconv.cpp rename to numba_cuda/cext/typeconv.cpp diff --git a/numba_cuda/numba/cuda/_dispatcher/typeconv.hpp b/numba_cuda/cext/typeconv.hpp similarity index 100% rename from numba_cuda/numba/cuda/_dispatcher/typeconv.hpp rename to numba_cuda/cext/typeconv.hpp diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 819d8a6e4..0a6b3cb88 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -13,7 +13,7 @@ import numpy as np import numba -from numba import _devicearray +from numba_cuda.cext import _devicearray from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 7470cb8b5..fdce25eb5 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -43,7 +43,7 @@ from collections import namedtuple, deque -from numba import mviewbuf +from numba_cuda.cext import mviewbuf from numba.core import config from numba.cuda import utils, serialize from .error import CudaSupportError, CudaDriverError diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index a88093e31..8d38cf333 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba.cuda import _dispatcher +from numba_cuda.cext import _dispatcher from warnings import warn diff --git a/setup.py b/setup.py index c4a4b671b..7ecc40714 100644 --- a/setup.py +++ b/setup.py @@ -37,28 +37,49 @@ def get_ext_modules(): "m", ] + ext_devicearray = Extension( + name="numba_cuda.cext._devicearray", + sources=["numba_cuda/cext/_devicearray.cpp"], + depends=[ + "numba_cuda/cext/_pymodule.h", + "numba_cuda/cext/_devicearray.h", + ], + include_dirs=["numba_cuda/cext"], + extra_compile_args=["-std=c++11"], + ) + + install_name_tool_fixer = [] + if sys.platform == "darwin": + install_name_tool_fixer = ["-headerpad_max_install_names"] + + ext_mviewbuf = Extension( + name="numba_cuda.cext.mviewbuf", + extra_link_args=install_name_tool_fixer, + sources=["numba_cuda/cext/mviewbuf.c"], + ) + dispatcher_sources = [ - "numba_cuda/numba/cuda/_dispatcher/_dispatcher.cpp", - "numba_cuda/numba/cuda/_dispatcher/_typeof.cpp", - "numba_cuda/numba/cuda/_dispatcher/_hashtable.cpp", - "numba_cuda/numba/cuda/_dispatcher/typeconv.cpp", + "numba_cuda/cext/_dispatcher.cpp", + "numba_cuda/cext/_typeof.cpp", + 
"numba_cuda/cext/_hashtable.cpp", + "numba_cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda.numba.cuda._dispatcher", + name="numba_cuda.cext._dispatcher", sources=dispatcher_sources, depends=[ - "numba_cuda/numba/cuda/_dispatcher/_pymodule.h", - "numba_cuda/numba/cuda/_dispatcher/_typeof.h", - "numba_cuda/numba/cuda/_dispatcher/_hashtable.h", + "numba_cuda/cext/_pymodule.h", + "numba_cuda/cext/_typeof.h", + "numba_cuda/cext/_hashtable.h", ], extra_compile_args=["-std=c++11"], **np_compile_args, ) - # Add our include directory to the existing include_dirs - ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/_dispatcher") + # Append our cext dir to include_dirs + ext_dispatcher.include_dirs.append("numba_cuda/cext") - return [ext_dispatcher] + return [ext_dispatcher, ext_mviewbuf, ext_devicearray] def is_building(): From 5c80d566cfd156323bc345a2109fc506ea1693e1 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 12 Aug 2025 10:25:34 -0700 Subject: [PATCH 09/62] Update CI numpy version reqs and build matrix --- ci/matrix.yml | 8 ++++++++ conda/recipes/numba-cuda/meta.yaml | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 47019cc16..2e771b820 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,7 +1,15 @@ build-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index cba6deb08..6beeac6a7 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,7 +25,8 @@ requirements: - python - pip - setuptools - - numpy + - numpy >= 2.1.0 # [py >= 310] + - numpy == 1.26.0 # [py == 39] run: - python - numba >=0.59.1 From 22a41f89a6501dc089fca39d00a04217f7239f14 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 12 Aug 2025 10:28:29 -0700 Subject: [PATCH 10/62] Fix numpy ver requirements --- conda/recipes/numba-cuda/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 6beeac6a7..7a26d6ad0 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,8 +25,8 @@ requirements: - python - pip - setuptools - - numpy >= 2.1.0 # [py >= 310] - - numpy == 1.26.0 # [py == 39] + - numpy >=2.1.0 # [py >= 310] + - numpy ==1.26.0 # [py == 39] run: - python - numba >=0.59.1 From 47e5b1851ab33543d4e707840ef6021d5b3e482f Mon Sep 17 00:00:00 2001 
From: Vijay Kandiah Date: Tue, 12 Aug 2025 10:34:57 -0700 Subject: [PATCH 11/62] Remove additional configs in build-matrix and simulator-matrix --- ci/matrix.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 2e771b820..47019cc16 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,15 +1,7 @@ build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } From 52da47eeaa4c3ef4b583e4163fc0bbc460101ccb Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 12 Aug 2025 11:02:35 -0700 Subject: [PATCH 12/62] Fix setup.py build_commands list --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7ecc40714..448547e0b 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,8 @@ def is_building(): "build", "build_py", "build_ext", - "build_clibbuild_scripts", + "build_clib", + "build_scripts", "install", "install_lib", "install_headers", @@ -109,7 +110,7 @@ def is_building(): "bdist_rpm", "bdist_wininst", "check", - "build_doc", + "build_docs", "bdist_wheel", "bdist_egg", "develop", From 366b3d74720508285f689201de03b2656e43ed58 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 19 Aug 2025 21:13:39 +0100 Subject: [PATCH 13/62] Make numba_cuda.cext a package --- numba_cuda/cext/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 numba_cuda/cext/__init__.py diff --git a/numba_cuda/cext/__init__.py b/numba_cuda/cext/__init__.py new file mode 100644 index 000000000..e69de29bb From 487e872a943ceaa69a6f7b7bcbd969068783aecd Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 19 Aug 2025 21:50:52 +0100 Subject: [PATCH 14/62] Modify build matrix for each Python version This is not sufficient. We also need: - One build for each Python version for both of amd64 and aarch64 - The docs build to be changed to download the py313-amd64 wheels - Maybe other things I missed? 
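
For reference, the build artifacts become per-Python-version, presumably so
that the matrix jobs do not overwrite each other's uploads. A rough sketch
of the naming scheme the workflows move to (plain Python for illustration
only; the real names are produced by the workflow templates, not this
snippet):

    # Sketch: per-version artifact names used by the upload/download steps.
    python_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"]
    conda_artifacts = ["conda-repo-py" + v for v in python_versions]
    wheel_artifacts = ["wheel-py" + v for v in python_versions]
    sdist_artifacts = ["sdist-py" + v for v in python_versions]
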
--- .github/workflows/conda-python-build.yaml | 2 +- .github/workflows/conda-python-tests.yaml | 2 +- .github/workflows/wheels-build.yaml | 4 ++-- .github/workflows/wheels-test.yaml | 2 +- ci/matrix.yml | 4 ++++ 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml index bec7bae68..f71890e91 100644 --- a/.github/workflows/conda-python-build.yaml +++ b/.github/workflows/conda-python-build.yaml @@ -101,7 +101,7 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }} path: "/tmp/conda-bld-output" - name: Publish conda package if: inputs.upload_to_anaconda diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 3623fd40d..3f7552ede 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -93,7 +93,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml index 2c86cc1b4..15e39fb56 100644 --- a/.github/workflows/wheels-build.yaml +++ b/.github/workflows/wheels-build.yaml @@ -149,11 +149,11 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: sdist + name: sdist-py${{ matrix.PY_VER }} path: ${{ env.sdist_path }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel + name: wheel-py${{ matrix.PY_VER }} path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 3c6d294d3..fe354ff77 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -111,7 +111,7 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel + name: wheel-py${{ matrix.PY_VER }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/matrix.yml b/ci/matrix.yml index 47019cc16..611930905 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,4 +1,8 @@ build-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } From 42bea6ab6530ea1954fae5066f187beed93f81b7 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 11:56:07 -0700 Subject: [PATCH 15/62] Move cext to numba_cuda.numba.cuda and enable build matrix for arm64 --- .github/workflows/conda-python-build.yaml | 2 +- .github/workflows/conda-python-tests.yaml | 2 +- .github/workflows/wheels-build.yaml | 4 +-- .github/workflows/wheels-test.yaml | 2 +- ci/matrix.yml | 5 +++ numba_cuda/{ => numba/cuda}/cext/__init__.py | 0 .../{ => numba/cuda}/cext/_devicearray.cpp | 2 +- .../{ => numba/cuda}/cext/_devicearray.h | 0 .../{ => numba/cuda}/cext/_dispatcher.cpp | 7 ++-- .../{ => 
numba/cuda}/cext/_hashtable.cpp | 0 numba_cuda/{ => numba/cuda}/cext/_hashtable.h | 0 numba_cuda/{ => numba/cuda}/cext/_pymodule.h | 0 numba_cuda/{ => numba/cuda}/cext/_typeof.cpp | 0 numba_cuda/{ => numba/cuda}/cext/_typeof.h | 0 numba_cuda/{ => numba/cuda}/cext/mviewbuf.c | 0 numba_cuda/{ => numba/cuda}/cext/typeconv.cpp | 0 numba_cuda/{ => numba/cuda}/cext/typeconv.hpp | 0 numba_cuda/numba/cuda/cudadrv/devicearray.py | 2 +- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/dispatcher.py | 2 +- setup.py | 32 +++++++++---------- 21 files changed, 33 insertions(+), 29 deletions(-) rename numba_cuda/{ => numba/cuda}/cext/__init__.py (100%) rename numba_cuda/{ => numba/cuda}/cext/_devicearray.cpp (99%) rename numba_cuda/{ => numba/cuda}/cext/_devicearray.h (100%) rename numba_cuda/{ => numba/cuda}/cext/_dispatcher.cpp (99%) rename numba_cuda/{ => numba/cuda}/cext/_hashtable.cpp (100%) rename numba_cuda/{ => numba/cuda}/cext/_hashtable.h (100%) rename numba_cuda/{ => numba/cuda}/cext/_pymodule.h (100%) rename numba_cuda/{ => numba/cuda}/cext/_typeof.cpp (100%) rename numba_cuda/{ => numba/cuda}/cext/_typeof.h (100%) rename numba_cuda/{ => numba/cuda}/cext/mviewbuf.c (100%) rename numba_cuda/{ => numba/cuda}/cext/typeconv.cpp (100%) rename numba_cuda/{ => numba/cuda}/cext/typeconv.hpp (100%) diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml index f71890e91..84e6a6951 100644 --- a/.github/workflows/conda-python-build.yaml +++ b/.github/workflows/conda-python-build.yaml @@ -101,7 +101,7 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: conda-repo-py${{ matrix.PY_VER }} + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: "/tmp/conda-bld-output" - name: Publish conda package if: inputs.upload_to_anaconda diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 3f7552ede..02ead9985 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -93,7 +93,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }} + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml index 15e39fb56..c876bfb16 100644 --- a/.github/workflows/wheels-build.yaml +++ b/.github/workflows/wheels-build.yaml @@ -149,11 +149,11 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: sdist-py${{ matrix.PY_VER }} + name: sdist-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.sdist_path }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel-py${{ matrix.PY_VER }} + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index fe354ff77..f45b5f1fd 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -111,7 +111,7 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel-py${{ matrix.PY_VER }} + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/matrix.yml 
b/ci/matrix.yml index 611930905..d8da7b351 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -4,6 +4,11 @@ build-matrix: - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: diff --git a/numba_cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py similarity index 100% rename from numba_cuda/cext/__init__.py rename to numba_cuda/numba/cuda/cext/__init__.py diff --git a/numba_cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp similarity index 99% rename from numba_cuda/cext/_devicearray.cpp rename to numba_cuda/numba/cuda/cext/_devicearray.cpp index a822c6261..645aafdce 100644 --- a/numba_cuda/cext/_devicearray.cpp +++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp @@ -120,7 +120,7 @@ MOD_INIT(_devicearray) { if (m == NULL) goto error_occurred; - c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda.cext._devicearray._DEVICEARRAY_API", NULL); + c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda._devicearray._DEVICEARRAY_API", NULL); if (c_api == NULL) goto error_occurred; diff --git a/numba_cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h similarity index 100% rename from numba_cuda/cext/_devicearray.h rename to numba_cuda/numba/cuda/cext/_devicearray.h diff --git a/numba_cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp similarity index 99% rename from numba_cuda/cext/_dispatcher.cpp rename to numba_cuda/numba/cuda/cext/_dispatcher.cpp index b2f3e1c54..2e8e1a882 100644 --- a/numba_cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -935,14 +935,13 @@ Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) static int import_devicearray(void) { - // Import our vendored devicearray (following original numba's pattern) - PyObject *devicearray = PyImport_ImportModule("numba_cuda.cext._devicearray"); + PyObject *devicearray = PyImport_ImportModule("numba_cuda._devicearray"); if (devicearray == NULL) { return -1; } Py_DECREF(devicearray); - DeviceArray_API = (void**)PyCapsule_Import("numba_cuda.cext._devicearray._DEVICEARRAY_API", 0); + DeviceArray_API = (void**)PyCapsule_Import("numba_cuda._devicearray._DEVICEARRAY_API", 0); if (DeviceArray_API == NULL) { return -1; } @@ -1057,7 +1056,7 @@ static PyMethodDef ext_methods[] = { MOD_INIT(_dispatcher) { if (import_devicearray() < 0) { PyErr_Print(); - PyErr_SetString(PyExc_ImportError, "numba_cuda.cext._devicearray failed to import"); + PyErr_SetString(PyExc_ImportError, "numba_cuda._devicearray failed to import"); return MOD_ERROR_VAL; } diff --git a/numba_cuda/cext/_hashtable.cpp b/numba_cuda/numba/cuda/cext/_hashtable.cpp similarity index 100% rename from numba_cuda/cext/_hashtable.cpp rename to numba_cuda/numba/cuda/cext/_hashtable.cpp diff --git a/numba_cuda/cext/_hashtable.h 
b/numba_cuda/numba/cuda/cext/_hashtable.h similarity index 100% rename from numba_cuda/cext/_hashtable.h rename to numba_cuda/numba/cuda/cext/_hashtable.h diff --git a/numba_cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h similarity index 100% rename from numba_cuda/cext/_pymodule.h rename to numba_cuda/numba/cuda/cext/_pymodule.h diff --git a/numba_cuda/cext/_typeof.cpp b/numba_cuda/numba/cuda/cext/_typeof.cpp similarity index 100% rename from numba_cuda/cext/_typeof.cpp rename to numba_cuda/numba/cuda/cext/_typeof.cpp diff --git a/numba_cuda/cext/_typeof.h b/numba_cuda/numba/cuda/cext/_typeof.h similarity index 100% rename from numba_cuda/cext/_typeof.h rename to numba_cuda/numba/cuda/cext/_typeof.h diff --git a/numba_cuda/cext/mviewbuf.c b/numba_cuda/numba/cuda/cext/mviewbuf.c similarity index 100% rename from numba_cuda/cext/mviewbuf.c rename to numba_cuda/numba/cuda/cext/mviewbuf.c diff --git a/numba_cuda/cext/typeconv.cpp b/numba_cuda/numba/cuda/cext/typeconv.cpp similarity index 100% rename from numba_cuda/cext/typeconv.cpp rename to numba_cuda/numba/cuda/cext/typeconv.cpp diff --git a/numba_cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp similarity index 100% rename from numba_cuda/cext/typeconv.hpp rename to numba_cuda/numba/cuda/cext/typeconv.hpp diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 0a6b3cb88..31d50abe2 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -13,7 +13,7 @@ import numpy as np import numba -from numba_cuda.cext import _devicearray +from numba_cuda import _devicearray from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index fdce25eb5..f88fa6712 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -43,7 +43,7 @@ from collections import namedtuple, deque -from numba_cuda.cext import mviewbuf +from numba.cuda import mviewbuf from numba.core import config from numba.cuda import utils, serialize from .error import CudaSupportError, CudaDriverError diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 8d38cf333..a88093e31 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba_cuda.cext import _dispatcher +from numba.cuda import _dispatcher from warnings import warn diff --git a/setup.py b/setup.py index 448547e0b..a0a637bc2 100644 --- a/setup.py +++ b/setup.py @@ -38,13 +38,13 @@ def get_ext_modules(): ] ext_devicearray = Extension( - name="numba_cuda.cext._devicearray", - sources=["numba_cuda/cext/_devicearray.cpp"], + name="numba_cuda._devicearray", + sources=["numba_cuda/numba/cuda/cext/_devicearray.cpp"], depends=[ - "numba_cuda/cext/_pymodule.h", - "numba_cuda/cext/_devicearray.h", + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_devicearray.h", ], - include_dirs=["numba_cuda/cext"], + include_dirs=["numba_cuda/numba/cuda/cext"], extra_compile_args=["-std=c++11"], ) @@ -53,31 +53,31 @@ def get_ext_modules(): install_name_tool_fixer = ["-headerpad_max_install_names"] ext_mviewbuf = Extension( - name="numba_cuda.cext.mviewbuf", + name="numba_cuda.numba.cuda.mviewbuf", 
extra_link_args=install_name_tool_fixer, - sources=["numba_cuda/cext/mviewbuf.c"], + sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], ) dispatcher_sources = [ - "numba_cuda/cext/_dispatcher.cpp", - "numba_cuda/cext/_typeof.cpp", - "numba_cuda/cext/_hashtable.cpp", - "numba_cuda/cext/typeconv.cpp", + "numba_cuda/numba/cuda/cext/_dispatcher.cpp", + "numba_cuda/numba/cuda/cext/_typeof.cpp", + "numba_cuda/numba/cuda/cext/_hashtable.cpp", + "numba_cuda/numba/cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda.cext._dispatcher", + name="numba_cuda.numba.cuda._dispatcher", sources=dispatcher_sources, depends=[ - "numba_cuda/cext/_pymodule.h", - "numba_cuda/cext/_typeof.h", - "numba_cuda/cext/_hashtable.h", + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_typeof.h", + "numba_cuda/numba/cuda/cext/_hashtable.h", ], extra_compile_args=["-std=c++11"], **np_compile_args, ) # Append our cext dir to include_dirs - ext_dispatcher.include_dirs.append("numba_cuda/cext") + ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/cext") return [ext_dispatcher, ext_mviewbuf, ext_devicearray] From 7cfe7c3f5253751d29f48b402e9aea41fe10c112 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 12:02:49 -0700 Subject: [PATCH 16/62] Make py3.9 arm64 and amd64 builds use cuda_ver 11.8.0 --- ci/matrix.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index d8da7b351..247027305 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,10 +1,10 @@ build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } From 88c0b2cda064d46eb2a05c29e2bcc01a1e2edb66 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 12:19:23 -0700 Subject: [PATCH 17/62] Make py3.9 a supported python version for cext. 
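
The cext sources carry a compile-time guard on the interpreter version;
this change extends it to accept 3.9 alongside 3.10-3.13. Roughly the same
check expressed in Python (illustration only; the actual guard is the
NB_SUPPORTED_PYTHON_MINOR preprocessor macro in _pymodule.h):

    import sys

    # Sketch mirroring the compile-time guard: major must be 3, minor 9..13.
    SUPPORTED_MINORS = {9, 10, 11, 12, 13}
    if sys.version_info[0] != 3 or sys.version_info[1] not in SUPPORTED_MINORS:
        raise RuntimeError("Python version is not supported.")
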
--- numba_cuda/numba/cuda/cext/_pymodule.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h index c261314f5..47eedcb8a 100644 --- a/numba_cuda/numba/cuda/cext/_pymodule.h +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -30,6 +30,6 @@ Py_DECREF(tmp); } while (0) -#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 9) || (PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) #endif /* NUMBA_PY_MODULE_H_ */ From 970e5bc51bb9a205ddf6f65e25c1f2a5aac86140 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 12:29:21 -0700 Subject: [PATCH 18/62] Make CI build-docs and simulator workflows point to correct build --- .github/workflows/docs-build.yaml | 2 +- .github/workflows/simulator-test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index ac9b8f244..d1b6a23d0 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -56,7 +56,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index 015efe8bb..625a69882 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -83,7 +83,7 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information From afaea07b6b20f720a59ce0b388b869959605728f Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Tue, 26 Aug 2025 13:42:54 -0700 Subject: [PATCH 19/62] Make CI scripts point to the right conda-repo-py dirs --- ci/build_docs.sh | 13 ++++++++++++- ci/test_conda.sh | 13 ++++++++++++- ci/test_conda_ctypes_binding.sh | 13 ++++++++++++- ci/test_simulator.sh | 13 ++++++++++++- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 74816c6f2..70d27619b 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -20,7 +20,18 @@ set +u conda activate docs set -u -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda pip install nvidia-sphinx-theme diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 70f4dc124..07ea5dbf5 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -38,7 +38,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + 
exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index a274c021e..7aa11aab3 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -33,7 +33,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index 832e2cbf7..e545bee49 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -21,7 +21,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" From f9d9e22cc2233498b2db96a261de325d9902f0db Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 27 Aug 2025 09:35:29 -0700 Subject: [PATCH 20/62] Change numpy ver for py3.9 build to 2.0.2 to support 1.x and 2.x --- conda/recipes/numba-cuda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 7a26d6ad0..658cfdeac 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -26,7 +26,7 @@ requirements: - pip - setuptools - numpy >=2.1.0 # [py >= 310] - - numpy ==1.26.0 # [py == 39] + - numpy ==2.0.2 # [py == 39] run: - python - numba >=0.59.1 From 24c5fda8af95ba2500101da955fee2f07bffdf61 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 27 Aug 2025 15:24:19 -0700 Subject: [PATCH 21/62] CI with pip install -e to ensure C extensions are properly linked --- conda/recipes/numba-cuda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 658cfdeac..b0876963d 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -18,7 +18,7 @@ source: build: noarch: python script: - - {{ PYTHON }} -m pip install . -vv + - {{ PYTHON }} -m pip install . -e -vv requirements: host: From f289d122055504abfa30cf5ee74b7b52cb115000 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 27 Aug 2025 15:25:25 -0700 Subject: [PATCH 22/62] Fix CI build --- conda/recipes/numba-cuda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index b0876963d..65cef4692 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -18,7 +18,7 @@ source: build: noarch: python script: - - {{ PYTHON }} -m pip install . 
-e -vv + - {{ PYTHON }} -m pip install -e . -vv requirements: host: From 3c48a3d1946db3018ed0ba1f07208e4f5e722e04 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Fri, 29 Aug 2025 16:37:19 -0700 Subject: [PATCH 23/62] Hack C extensions into import mechanism --- numba_cuda/numba/cuda/cext/__init__.py | 91 ++++++++++++++++++++ numba_cuda/numba/cuda/cext/_devicearray.cpp | 2 +- numba_cuda/numba/cuda/cext/_devicearray.h | 1 + numba_cuda/numba/cuda/cext/_dispatcher.cpp | 6 +- numba_cuda/numba/cuda/cudadrv/devicearray.py | 2 +- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- numba_cuda/numba/cuda/dispatcher.py | 2 +- setup.py | 4 +- 8 files changed, 101 insertions(+), 9 deletions(-) diff --git a/numba_cuda/numba/cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py index e69de29bb..4b1816572 100644 --- a/numba_cuda/numba/cuda/cext/__init__.py +++ b/numba_cuda/numba/cuda/cext/__init__.py @@ -0,0 +1,91 @@ +import sys +import importlib +import importlib.util +import importlib.machinery +from pathlib import Path +from types import ModuleType +from importlib.machinery import ModuleSpec + + +def _load_ext_from_spec( + spec: ModuleSpec, fullname: str, legacy_name: str +) -> ModuleType: + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[fullname] = module + sys.modules[legacy_name] = ( + module # Register under legacy name for C extensions + ) + + # Ensure parent modules exist for legacy name (e.g., numba_cuda for numba_cuda._devicearray) + parts = legacy_name.split(".") + for i in range(1, len(parts)): + parent_name = ".".join(parts[:i]) + if parent_name not in sys.modules: + parent_module = ModuleType(parent_name) + sys.modules[parent_name] = parent_module + + # Set the child as an attribute of the parent + parent_module = sys.modules[parent_name] + child_name = parts[i] + if i == len(parts) - 1: # This is the final module + setattr(parent_module, child_name, module) + elif not hasattr(parent_module, child_name): + # Create intermediate module if it doesn't exist + intermediate_name = ".".join(parts[: i + 1]) + if intermediate_name not in sys.modules: + intermediate_module = ModuleType(intermediate_name) + sys.modules[intermediate_name] = intermediate_module + setattr(parent_module, child_name, intermediate_module) + + spec.loader.exec_module(module) + return module + + +def _find_in_dir( + module_name: str, directory: Path | str | None +) -> ModuleSpec | None: + if not directory: + return None + return importlib.machinery.PathFinder.find_spec( + module_name, [str(directory)] + ) + + +def _load_cext_module( + module_basename: str, required: bool = True +) -> ModuleType | None: + fullname = f"numba.cuda.cext.{module_basename}" + legacy_name = f"numba_cuda.{module_basename}" + + # 1) Try local numba_cuda directory (for development builds) + local_numba_cuda = Path(__file__).parents[ + 3 + ] # Go up from cext/ to numba_cuda/ + spec = _find_in_dir(module_basename, local_numba_cuda) + + # 2) Fallback: scan sys.path for installed numba_cuda directory + if spec is None: + for entry in sys.path: + numba_cuda_dir = Path(entry) / "numba_cuda" + spec = _find_in_dir(module_basename, numba_cuda_dir) + if spec is not None: + break + + if spec is None: + if required: + raise ModuleNotFoundError( + f"Could not find '{module_basename}' in numba_cuda directories" + ) + return None + + return _load_ext_from_spec(spec, fullname, legacy_name) + + +# Load known cext modules (all required) +# Load _devicearray first since _dispatcher depends on it +_devicearray = 
_load_cext_module("_devicearray", required=True) +_dispatcher = _load_cext_module("_dispatcher", required=True) +mviewbuf = _load_cext_module("mviewbuf", required=True) + +__all__ = ["mviewbuf", "_dispatcher", "_devicearray"] diff --git a/numba_cuda/numba/cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp index 645aafdce..cdfc73595 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.cpp +++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp @@ -120,7 +120,7 @@ MOD_INIT(_devicearray) { if (m == NULL) goto error_occurred; - c_api = PyCapsule_New((void *)_DeviceArray_API, "numba_cuda._devicearray._DEVICEARRAY_API", NULL); + c_api = PyCapsule_New((void *)_DeviceArray_API, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API", NULL); if (c_api == NULL) goto error_occurred; diff --git a/numba_cuda/numba/cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h index 5b276eacf..ab469553a 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.h +++ b/numba_cuda/numba/cuda/cext/_devicearray.h @@ -5,6 +5,7 @@ extern "C" { #endif +#define NUMBA_DEVICEARRAY_IMPORT_NAME "numba_cuda._devicearray" /* These definitions should only be used by consumers of the Device Array API. * Consumers access the API through the opaque pointer stored in * _devicearray._DEVICEARRAY_API. We don't want these definitions in diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index 2e8e1a882..fc3757a10 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -935,13 +935,13 @@ Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) static int import_devicearray(void) { - PyObject *devicearray = PyImport_ImportModule("numba_cuda._devicearray"); + PyObject *devicearray = PyImport_ImportModule(NUMBA_DEVICEARRAY_IMPORT_NAME); if (devicearray == NULL) { return -1; } Py_DECREF(devicearray); - DeviceArray_API = (void**)PyCapsule_Import("numba_cuda._devicearray._DEVICEARRAY_API", 0); + DeviceArray_API = (void**)PyCapsule_Import(NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API", 0); if (DeviceArray_API == NULL) { return -1; } @@ -1056,7 +1056,7 @@ static PyMethodDef ext_methods[] = { MOD_INIT(_dispatcher) { if (import_devicearray() < 0) { PyErr_Print(); - PyErr_SetString(PyExc_ImportError, "numba_cuda._devicearray failed to import"); + PyErr_SetString(PyExc_ImportError, NUMBA_DEVICEARRAY_IMPORT_NAME " failed to import"); return MOD_ERROR_VAL; } diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 31d50abe2..e0799894f 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -13,7 +13,7 @@ import numpy as np import numba -from numba_cuda import _devicearray +from numba.cuda.cext import _devicearray from numba.cuda.cudadrv import devices, dummyarray from numba.cuda.cudadrv import driver as _driver from numba.core import types, config diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index f88fa6712..84f94c1a1 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -43,7 +43,7 @@ from collections import namedtuple, deque -from numba.cuda import mviewbuf +from numba.cuda.cext import mviewbuf from numba.core import config from numba.cuda import utils, serialize from .error import CudaSupportError, CudaDriverError diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 
a88093e31..f471f11ff 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -38,7 +38,7 @@ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY from numba import cuda -from numba.cuda import _dispatcher +from numba.cuda.cext import _dispatcher from warnings import warn diff --git a/setup.py b/setup.py index a0a637bc2..7ab1dd3af 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def get_ext_modules(): install_name_tool_fixer = ["-headerpad_max_install_names"] ext_mviewbuf = Extension( - name="numba_cuda.numba.cuda.mviewbuf", + name="numba_cuda.mviewbuf", extra_link_args=install_name_tool_fixer, sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], ) @@ -65,7 +65,7 @@ def get_ext_modules(): "numba_cuda/numba/cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda.numba.cuda._dispatcher", + name="numba_cuda._dispatcher", sources=dispatcher_sources, depends=[ "numba_cuda/numba/cuda/cext/_pymodule.h", From 618a885b3fd007ad7131a16aa024f553802b6c00 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Fri, 29 Aug 2025 16:48:27 -0700 Subject: [PATCH 24/62] Fix redirector --- site-packages/_numba_cuda_redirector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py index 1c76609ac..ffeca64a5 100644 --- a/site-packages/_numba_cuda_redirector.py +++ b/site-packages/_numba_cuda_redirector.py @@ -31,7 +31,7 @@ def ensure_initialized(self): numba_cuda_spec = importlib.util.find_spec("numba_cuda") - if numba_spec is None: + if numba_cuda_spec is None: warnings.warn(no_spec_msg.format("numba_cuda")) self.initialized = False return False From 582b66ecf5ba14cac25991dee988cb29a0d43f88 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Sat, 30 Aug 2025 09:13:57 -0700 Subject: [PATCH 25/62] Shorten CI runs for development --- ci/matrix.yml | 36 ++++++++++++++++++------------------ ci/test_conda.sh | 1 + 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 247027305..e11ea7e12 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -1,24 +1,24 @@ build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '11.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 
'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + # - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } test-matrix: - - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } - - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest' } + # - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest' } + # - { CUDA_VER: '11.8.0', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'l4', DRIVER: 'latest' } + # - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest' } - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest' } - - { CUDA_VER: '12.8.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest' } - - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest' } - - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } - - { CUDA_VER: '12.8.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.8.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest' } + # - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest' } + # - { CUDA_VER: '11.8.0', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'ubuntu20.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest' } + # - { CUDA_VER: '12.8.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest' } diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 07ea5dbf5..ad124c630 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -2,6 +2,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION set -euo pipefail +set -x . 
/opt/conda/etc/profile.d/conda.sh From 203e152f2567253390b40b93d1d4c8c1644c4fbf Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Sat, 30 Aug 2025 09:28:56 -0700 Subject: [PATCH 26/62] Debugging CI --- ci/test_conda.sh | 5 +++++ numba_cuda/numba/cuda/testing.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index ad124c630..958db3f5d 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -76,6 +76,11 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " +python -c "import numba.cuda as cuda; print(cuda.__file__)" +python -c "import numba_cuda as cuda; print(cuda.__file__)" +pip freeze | grep numba +python -c "import numba.cuda.testing; print(numba.cuda.testing.test_binary_generation_dir())" + CUDA_VER_MAJOR_MINOR=${CUDA_VER%.*} if [ "${CUDA_VER_MAJOR_MINOR%.*}" == "11" ] then diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py index 91d1cf243..1d7866a99 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -348,3 +348,9 @@ class ForeignArray(object): def __init__(self, arr): self._arr = arr self.__cuda_array_interface__ = arr.__cuda_array_interface__ + + +def test_binary_generation_dir(): + testing = Path(__file__) + binary_gen_dir = testing.parent / "tests" / "test_binary_generation" + return binary_gen_dir From 12026f895353a38ea19698b0a6e832e317990381 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:10:49 +0100 Subject: [PATCH 27/62] Use platform-dependent wheel path --- ci/test_wheel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index a6c474fde..bdc6b50b9 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -7,7 +7,7 @@ set -euo pipefail CUDA_VER_MAJOR_MINOR=${CUDA_VER%.*} rapids-logger "Install wheel with test dependencies" -package=$(realpath wheel/numba_cuda*.whl) +package=$(realpath wheel*/numba_cuda*.whl) echo "Package path: ${package}" DEPENDENCIES=( From 4dba8dbcca1264b91715e41864005a1b9e84696d Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:29:37 +0100 Subject: [PATCH 28/62] Build modules in cext --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4f959e043..cb5d4b4db 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ def get_ext_modules(): ] ext_devicearray = Extension( - name="numba_cuda._devicearray", + name="numba_cuda.numba.cuda.cext._devicearray", sources=["numba_cuda/numba/cuda/cext/_devicearray.cpp"], depends=[ "numba_cuda/numba/cuda/cext/_pymodule.h", @@ -55,7 +55,7 @@ def get_ext_modules(): install_name_tool_fixer = ["-headerpad_max_install_names"] ext_mviewbuf = Extension( - name="numba_cuda.mviewbuf", + name="numba_cuda.numba.cuda.cext.mviewbuf", extra_link_args=install_name_tool_fixer, sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], ) @@ -67,7 +67,7 @@ def get_ext_modules(): "numba_cuda/numba/cuda/cext/typeconv.cpp", ] ext_dispatcher = Extension( - name="numba_cuda._dispatcher", + name="numba_cuda.numba.cuda.cext._dispatcher", sources=dispatcher_sources, depends=[ "numba_cuda/numba/cuda/cext/_pymodule.h", From 486dfae03c82dcfd6e685378f920907d61d1e672 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:32:49 +0100 Subject: [PATCH 29/62] Don't use editable install in conda build --- conda/recipes/numba-cuda/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/recipes/numba-cuda/meta.yaml 
b/conda/recipes/numba-cuda/meta.yaml index f792814bc..8016ca6d3 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -17,9 +17,8 @@ source: path: ../../.. build: - noarch: python script: - - {{ PYTHON }} -m pip install -e . -vv + - {{ PYTHON }} -m pip install . -vv requirements: host: From a93e465b8c1d9c908e26f00b11ae39574049a143 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:33:35 +0100 Subject: [PATCH 30/62] Use correct search path for C extensions --- numba_cuda/numba/cuda/cext/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py index 4b1816572..c54155bae 100644 --- a/numba_cuda/numba/cuda/cext/__init__.py +++ b/numba_cuda/numba/cuda/cext/__init__.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + import sys import importlib import importlib.util @@ -67,7 +70,7 @@ def _load_cext_module( # 2) Fallback: scan sys.path for installed numba_cuda directory if spec is None: for entry in sys.path: - numba_cuda_dir = Path(entry) / "numba_cuda" + numba_cuda_dir = Path(entry) / "numba_cuda/numba/cuda/cext" spec = _find_in_dir(module_basename, numba_cuda_dir) if spec is not None: break From 9b59060e9f3977aeeaf2991b60153a01cc1bf969 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 2 Sep 2025 19:46:05 +0100 Subject: [PATCH 31/62] Remove debug prints --- ci/matrix.yml | 9 +++++++++ ci/test_conda.sh | 5 ----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 6cf2e4dac..7ab320d2f 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -5,7 +5,16 @@ # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES] # build-matrix: + - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } # We test "oldest" dependencies with the oldest supported Python version and diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 321a75eeb..cf51a7522 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -98,11 +98,6 @@ test_dir = root + \"numba/cuda/tests/test_binary_generation/\" print(test_dir) " -python -c "import numba.cuda as cuda; print(cuda.__file__)" -python -c "import numba_cuda as cuda; print(cuda.__file__)" -pip freeze | grep numba -python -c "import numba.cuda.testing; print(numba.cuda.testing.test_binary_generation_dir())" - rapids-logger "Build tests" export NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$GET_TEST_BINARY_DIR") From 1658db671ba14e41697025b41c42f61fbe09d6e9 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: 
Tue, 2 Sep 2025 20:05:05 +0100 Subject: [PATCH 32/62] COrrect conda-repo location --- ci/test_conda.sh | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index cf51a7522..b0e799d98 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -45,10 +45,25 @@ if [ "${RAPIDS_DEPENDENCIES:-}" = "oldest" ]; then DEPENDENCIES+=("numba==0.60.0") fi +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +PY="${RAPIDS_PY_VER//./}" + +repo=`pwd`/conda-repo-py${PY}-${ARCH} + rapids-mamba-retry create \ -n test \ --strict-channel-priority \ - --channel "`pwd`/conda-repo" \ + --channel "$repo" \ --channel conda-forge \ "${DEPENDENCIES[@]}" @@ -59,18 +74,7 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c ${repo} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" From 6349ad72dc9526a4482affc6647b6a61e14447d4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 10:50:13 +0100 Subject: [PATCH 33/62] Again try to correct conda repo location --- ci/test_conda.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index b0e799d98..94feef921 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -56,9 +56,7 @@ else exit 1 fi -PY="${RAPIDS_PY_VER//./}" - -repo=`pwd`/conda-repo-py${PY}-${ARCH} +repo=`pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH} rapids-mamba-retry create \ -n test \ From 727e6f8fa6bc5a2bf03c162dba1381422930b44b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 10:59:18 +0100 Subject: [PATCH 34/62] Correct usage of `download-artifact` The artiface name should be given as `with.name`, not just `name` - `name` just names the step. 
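
For example, a corrected download step looks like the following sketch (the
artifact name shown simply mirrors the naming convention used elsewhere in
this series):

  - uses: actions/download-artifact@v4
    with:
      name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }}

A `path:` key can also be supplied under `with:` to choose where the artifact
is unpacked; a later patch in this series adds that as well.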
--- .github/workflows/conda-python-tests.yaml | 3 ++- .github/workflows/docs-build.yaml | 3 ++- .github/workflows/simulator-test.yaml | 3 ++- .github/workflows/wheel-windows-tests.yaml | 3 ++- .github/workflows/wheels-test.yaml | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index fcedcf17c..629b4d0fd 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -97,7 +97,8 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 87f6bc58d..945766078 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -59,7 +59,8 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index 78f3e85b8..d76dca639 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -86,7 +86,8 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 1847ccf7e..dd7ce65cf 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -49,7 +49,8 @@ jobs: run: nvidia-smi - uses: actions/download-artifact@v4 - name: wheel + with: + name: wheel - name: Display structure of downloaded files run: Get-ChildItem -Recurse diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 4fd38c889..4c6f5564a 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -115,7 +115,8 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + with: + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} - name: Display structure of downloaded files run: ls -R - name: Standardize repository information From dc1bbb83805c8fda2d953824fa7637f8323a9953 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 12:12:32 +0100 Subject: [PATCH 35/62] Revert "Again try to correct conda repo location" This reverts commit 6349ad72dc9526a4482affc6647b6a61e14447d4. 
--- ci/test_conda.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 94feef921..b0e799d98 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -56,7 +56,9 @@ else exit 1 fi -repo=`pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH} +PY="${RAPIDS_PY_VER//./}" + +repo=`pwd`/conda-repo-py${PY}-${ARCH} rapids-mamba-retry create \ -n test \ From fc3c577b3a429e8ddb0d83a4320379d0191779f1 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 12:12:38 +0100 Subject: [PATCH 36/62] Revert "COrrect conda-repo location" This reverts commit 1658db671ba14e41697025b41c42f61fbe09d6e9. --- ci/test_conda.sh | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index b0e799d98..cf51a7522 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -45,25 +45,10 @@ if [ "${RAPIDS_DEPENDENCIES:-}" = "oldest" ]; then DEPENDENCIES+=("numba==0.60.0") fi -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -PY="${RAPIDS_PY_VER//./}" - -repo=`pwd`/conda-repo-py${PY}-${ARCH} - rapids-mamba-retry create \ -n test \ --strict-channel-priority \ - --channel "$repo" \ + --channel "`pwd`/conda-repo" \ --channel conda-forge \ "${DEPENDENCIES[@]}" @@ -74,7 +59,18 @@ set -u pip install filecheck -rapids-mamba-retry install -c ${repo} numba-cuda +# Detect system architecture to set conda repo path +ARCH=$(uname -m) +if [[ "$ARCH" == "x86_64" ]]; then + ARCH_SUFFIX="amd64" +elif [[ "$ARCH" == "aarch64" ]]; then + ARCH_SUFFIX="arm64" +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" From 1e507b54ef25004d676d62530405da6cccc33e7c Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 12:17:31 +0100 Subject: [PATCH 37/62] Another attempt to fix artifact locations / paths --- .github/workflows/conda-python-tests.yaml | 1 + .github/workflows/docs-build.yaml | 1 + .github/workflows/simulator-test.yaml | 1 + .github/workflows/wheels-test.yaml | 1 + ci/build_docs.sh | 13 +------------ ci/test_conda.sh | 2 +- ci/test_conda_ctypes_binding.sh | 13 +------------ ci/test_simulator.sh | 13 +------------ ci/test_wheel.sh | 2 +- 9 files changed, 9 insertions(+), 38 deletions(-) diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 629b4d0fd..c438ebbb8 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -99,6 +99,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 945766078..9dcae3bbe 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -61,6 +61,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git 
a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index d76dca639..fe306bc26 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -88,6 +88,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 4c6f5564a..afa05076f 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -117,6 +117,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: wheel - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/build_docs.sh b/ci/build_docs.sh index ed1d087cf..b4a791446 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -29,18 +29,7 @@ set +u conda activate docs set -u -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda pip install nvidia-sphinx-theme diff --git a/ci/test_conda.sh b/ci/test_conda.sh index cf51a7522..47fb3d01f 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -70,7 +70,7 @@ else exit 1 fi -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index 835d33b38..5faaddedb 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -44,18 +44,7 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index a17618939..c11e2f5b7 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -31,18 +31,7 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo-py${RAPIDS_PY_VERSION}-${ARCH_SUFFIX} numba-cuda +rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index bdc6b50b9..a6c474fde 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -7,7 +7,7 @@ set -euo pipefail CUDA_VER_MAJOR_MINOR=${CUDA_VER%.*} 
rapids-logger "Install wheel with test dependencies" -package=$(realpath wheel*/numba_cuda*.whl) +package=$(realpath wheel/numba_cuda*.whl) echo "Package path: ${package}" DEPENDENCIES=( From 42654425a885eeb6c410f28df185ba3678b8b4b3 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:05:32 +0100 Subject: [PATCH 38/62] Remove Python 3.9 from matrix --- ci/matrix.yml | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 7ab320d2f..f810738c1 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -5,12 +5,10 @@ # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES] # build-matrix: - - { ARCH: 'amd64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - - { ARCH: 'arm64', PY_VER: '3.9', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } @@ -20,17 +18,17 @@ simulator-matrix: # We test "oldest" dependencies with the oldest supported Python version and # the second-newest Python version. test-matrix: - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'arm64', 
PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } From 112f3c9ab96374336c45e3f6808e4069207d41d2 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:07:27 +0100 Subject: [PATCH 39/62] Fix up pre-commit violations --- numba_cuda/numba/cuda/cext/_devicearray.cpp | 3 +++ numba_cuda/numba/cuda/cext/_devicearray.h | 3 +++ numba_cuda/numba/cuda/cext/_dispatcher.cpp | 3 +++ numba_cuda/numba/cuda/cext/_hashtable.cpp | 3 +++ numba_cuda/numba/cuda/cext/_hashtable.h | 3 +++ numba_cuda/numba/cuda/cext/_pymodule.h | 3 +++ numba_cuda/numba/cuda/cext/_typeof.cpp | 3 +++ numba_cuda/numba/cuda/cext/_typeof.h | 3 +++ numba_cuda/numba/cuda/cext/typeconv.cpp | 3 +++ 9 files changed, 27 insertions(+) diff --git a/numba_cuda/numba/cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp index cdfc73595..60e11a69f 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.cpp +++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + /* This file contains the base class implementation for all device arrays. The * base class is implemented in C so that computing typecodes for device arrays * can be implemented efficiently. */ diff --git a/numba_cuda/numba/cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h index ab469553a..e16726983 100644 --- a/numba_cuda/numba/cuda/cext/_devicearray.h +++ b/numba_cuda/numba/cuda/cext/_devicearray.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_DEVICEARRAY_H_ #define NUMBA_DEVICEARRAY_H_ diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index fc3757a10..4f5d20b6c 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause + #include "_pymodule.h" #include diff --git a/numba_cuda/numba/cuda/cext/_hashtable.cpp b/numba_cuda/numba/cuda/cext/_hashtable.cpp index d926256d3..7d5dda962 100644 --- a/numba_cuda/numba/cuda/cext/_hashtable.cpp +++ b/numba_cuda/numba/cuda/cext/_hashtable.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + /* * This file and _hashtable.h are from CPython 3.5. The symbols have been * renamed from _Py_hashxxx to _Numba_hashxxx to avoid name clashes with diff --git a/numba_cuda/numba/cuda/cext/_hashtable.h b/numba_cuda/numba/cuda/cext/_hashtable.h index fbc6d6013..9fb719472 100644 --- a/numba_cuda/numba/cuda/cext/_hashtable.h +++ b/numba_cuda/numba/cuda/cext/_hashtable.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + /* * See _hashtable.c for more information about this file. */ diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h index 47eedcb8a..cff21ebe7 100644 --- a/numba_cuda/numba/cuda/cext/_pymodule.h +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_PY_MODULE_H_ #define NUMBA_PY_MODULE_H_ diff --git a/numba_cuda/numba/cuda/cext/_typeof.cpp b/numba_cuda/numba/cuda/cext/_typeof.cpp index 4f03d0070..6730b6d6f 100644 --- a/numba_cuda/numba/cuda/cext/_typeof.cpp +++ b/numba_cuda/numba/cuda/cext/_typeof.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #include "_pymodule.h" #include diff --git a/numba_cuda/numba/cuda/cext/_typeof.h b/numba_cuda/numba/cuda/cext/_typeof.h index 6e0039b5f..54e39ff80 100644 --- a/numba_cuda/numba/cuda/cext/_typeof.h +++ b/numba_cuda/numba/cuda/cext/_typeof.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_TYPEOF_H_ #define NUMBA_TYPEOF_H_ diff --git a/numba_cuda/numba/cuda/cext/typeconv.cpp b/numba_cuda/numba/cuda/cext/typeconv.cpp index 3c51fdfa7..5af7d16b3 100644 --- a/numba_cuda/numba/cuda/cext/typeconv.cpp +++ b/numba_cuda/numba/cuda/cext/typeconv.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #include #include #include From 0cf1ca8ca8110ddea538336ba49b7859219a2d2f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:08:55 +0100 Subject: [PATCH 40/62] Fix up pre-commit violations --- numba_cuda/numba/cuda/cext/mviewbuf.c | 3 +++ numba_cuda/numba/cuda/cext/typeconv.hpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/numba_cuda/numba/cuda/cext/mviewbuf.c b/numba_cuda/numba/cuda/cext/mviewbuf.c index 33f0888c7..eb5075d0b 100644 --- a/numba_cuda/numba/cuda/cext/mviewbuf.c +++ b/numba_cuda/numba/cuda/cext/mviewbuf.c @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause + #include "_pymodule.h" static int get_writable_buffer(PyObject* obj, Py_buffer *buf, int force) diff --git a/numba_cuda/numba/cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp index 1f3cb9359..da5d87a77 100644 --- a/numba_cuda/numba/cuda/cext/typeconv.hpp +++ b/numba_cuda/numba/cuda/cext/typeconv.hpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + #ifndef NUMBA_TYPECONV_HPP_ #define NUMBA_TYPECONV_HPP_ #include From e7c5c9179b6ce9ad4f82b0e4972c61a9b8921230 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 13:20:04 +0100 Subject: [PATCH 41/62] Attempt to fix docs build repo --- .github/workflows/docs-build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 9dcae3bbe..83c78fe8d 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -49,7 +49,7 @@ jobs: env: RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts container: - image: rapidsai/ci-conda:latest + image: rapidsai/ci-conda:cuda13.0.0-ubuntu24.04-py3.13 env: RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} steps: @@ -60,7 +60,7 @@ jobs: fetch-depth: 0 - uses: actions/download-artifact@v4 with: - name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + name: conda-repo-py3.13-amd64 path: conda-repo - name: Display structure of downloaded files run: ls -R From f3948a079c44d13a5e99e7e2bad9d2e203bbc679 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:02:53 +0100 Subject: [PATCH 42/62] Fix matrix for coverage report --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index da52b2d39..92b67c5dc 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -169,4 +169,4 @@ jobs: build_type: pull-request script: "ci/coverage_report.sh" matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.11")) | .[0:1]' + matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.12")) | .[0:1]' From 199439796f96d359761d9b0ffccd67b6338debc4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:38:11 +0100 Subject: [PATCH 43/62] Build wheel on Windows --- .github/workflows/pr.yaml | 5 +- .github/workflows/wheel-windows-build.yaml | 58 ++++++++++++++++++++++ .github/workflows/wheel-windows-tests.yaml | 3 +- ci/build_wheel.ps1 | 33 ++++++++++++ 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/wheel-windows-build.yaml create mode 100644 ci/build_wheel.ps1 diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 92b67c5dc..701725134 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -22,6 +22,7 @@ jobs: - test-conda-ctypes-binding - test-simulator - build-wheels + - build-wheels-windows - test-wheels-windows - test-wheels - test-wheels-ctypes-binding @@ -108,9 +109,11 @@ jobs: build_type: pull-request script: "ci/build_wheel.sh" matrix: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + build-wheels-windows: + uses: ./.github/workflows/wheel-windows-build.yaml test-wheels-windows: needs: - - build-wheels + - build-wheels-windows - compute-matrix uses: ./.github/workflows/wheel-windows-tests.yaml 
test-wheels: diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml new file mode 100644 index 000000000..9bbc7cd2e --- /dev/null +++ b/.github/workflows/wheel-windows-build.yaml @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +name: "CI: Build wheel on Windows" + +on: + workflow_call: + inputs: + sha: + type: string + repo: + type: string + script: + type: string + default: "./ci/test_wheel.ps1" + +jobs: + compute-matrix: + runs-on: ubuntu-latest + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Python Test Matrix + id: compute-matrix + run: | + set -eo pipefail + export TEST_MATRIX="{ ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0' }" + MATRIX=$(yq -n -o json 'env(TEST_MATRIX)' | jq -c) + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + build: + name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, windows + needs: compute-matrix + strategy: + fail-fast: false + matrix: ${{ fromJSON('{"ARCH":["amd64"],"PY_VER":["3.12"],"CUDA_VER":["12.8.0"]}') }} + runs-on: "cuda-python-windows-gpu-github" + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - name: Set up Python ${{ matrix.PY_VER }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.PY_VER }} + + - name: Python build + run: | + ${{ inputs.script }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + - name: Upload wheel + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }}-windows + path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index dd7ce65cf..fa53ffc5b 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -50,7 +50,8 @@ jobs: - uses: actions/download-artifact@v4 with: - name: wheel + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }}-windows + path: wheel - name: Display structure of downloaded files run: Get-ChildItem -Recurse diff --git a/ci/build_wheel.ps1 b/ci/build_wheel.ps1 new file mode 100644 index 000000000..24e27aa62 --- /dev/null +++ b/ci/build_wheel.ps1 @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +$ErrorActionPreference = 'Stop' +Set-StrictMode -Version Latest + +function rapids-logger { + param ( + [Parameter(Mandatory=$true)] + [string]$Text + ) + + # Determine padding and box width + $padding = 2 + $boxWidth = $Text.Length + ($padding * 2) + $topBottom = '+' + ('-' * $boxWidth) + '+' + $middle = '|' + (' ' * $padding) + $Text + (' ' * $padding) + '|' + + # Print the box in green + Write-Host $topBottom -ForegroundColor Green + Write-Host $middle -ForegroundColor Green + Write-Host $topBottom -ForegroundColor Green +} + +rapids-logger "Install build package" +python -m pip install build + +rapids-logger "Build sdist and wheel" +python -m build . 
+ +$wheel_path = Resolve-Path dist\numba_cuda*.whl | Select-Object -ExpandProperty Path +echo "Wheel path: $wheel_path" +echo "wheel_path=$wheel_path" >> $GITHUB_ENV From faed1784aed5ed6d3eb816df08cf474d1cdfa2cb Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:44:45 +0100 Subject: [PATCH 44/62] Use correct script for wheel Windows build --- .github/workflows/wheel-windows-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 9bbc7cd2e..53c00a1ca 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -12,7 +12,7 @@ on: type: string script: type: string - default: "./ci/test_wheel.ps1" + default: "./ci/build_wheel.ps1" jobs: compute-matrix: From 9d0ca3a90b22d282bb13a7039b936a9a8073b2fa Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 14:51:20 +0100 Subject: [PATCH 45/62] Add MSVC to Windows build --- .github/workflows/wheel-windows-build.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 53c00a1ca..2c724bea2 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -40,16 +40,21 @@ jobs: repository: ${{ inputs.repo }} ref: ${{ inputs.sha }} fetch-depth: 0 + - name: Set up Python ${{ matrix.PY_VER }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.PY_VER }} + - name: Set up MSVC + uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits + - name: Python build run: | ${{ inputs.script }} env: CUDA_VER: ${{ matrix.CUDA_VER }} + - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 From d9cbd9ec3f8b2aa021dfbd1138006bc1c29dbc09 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 15:01:30 +0100 Subject: [PATCH 46/62] Use a different runner for Windows build --- .github/workflows/wheel-windows-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 2c724bea2..fbf52a634 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -33,7 +33,7 @@ jobs: strategy: fail-fast: false matrix: ${{ fromJSON('{"ARCH":["amd64"],"PY_VER":["3.12"],"CUDA_VER":["12.8.0"]}') }} - runs-on: "cuda-python-windows-gpu-github" + runs-on: windows-2022 steps: - uses: actions/checkout@v4 with: From 770813c41fa2c8ef27d4c7532b0f097419a5be84 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 17:10:07 +0100 Subject: [PATCH 47/62] Try using env var for GITHUB_ENV in Windows wheel build --- ci/build_wheel.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_wheel.ps1 b/ci/build_wheel.ps1 index 24e27aa62..77608ac54 100644 --- a/ci/build_wheel.ps1 +++ b/ci/build_wheel.ps1 @@ -30,4 +30,4 @@ python -m build . 
$wheel_path = Resolve-Path dist\numba_cuda*.whl | Select-Object -ExpandProperty Path echo "Wheel path: $wheel_path" -echo "wheel_path=$wheel_path" >> $GITHUB_ENV +echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV From 180c9a45e13a384ce5f41b4c3127d0544d55003a Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 17:36:10 +0100 Subject: [PATCH 48/62] Attempt to have Python matrix for Windows build --- .github/workflows/pr.yaml | 10 ++++++++++ .github/workflows/wheel-windows-build.yaml | 10 ++++------ .github/workflows/wheel-windows-tests.yaml | 9 ++++----- ci/matrix.yml | 5 +++++ 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 701725134..4e8182be6 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -49,12 +49,16 @@ jobs: id: compute-matrix run: | BUILD_MATRIX="$(yq '.build-matrix' ci/matrix.yml)" + WINDOWS_MATRIX="$(yq '.windows-matrix' ci/matrix.yml)" SIMULATOR_MATRIX="$(yq '.simulator-matrix' ci/matrix.yml)" TEST_MATRIX="$(yq '.test-matrix' ci/matrix.yml)" { echo 'BUILD_MATRIX< Date: Wed, 3 Sep 2025 20:16:26 +0100 Subject: [PATCH 49/62] Add missing output in compute-matrix --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4e8182be6..1c6df6a3d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -41,6 +41,7 @@ jobs: runs-on: ubuntu-latest outputs: BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} + WINDOWS_MATRIX: ${{ steps.compute-matrix.outputs.WINDOWS_MATRIX }} SIMULATOR_MATRIX: ${{ steps.compute-matrix.outputs.SIMULATOR_MATRIX }} TEST_MATRIX: ${{ steps.compute-matrix.outputs.TEST_MATRIX }} steps: From bef9816c67b70b9252530d7af85f9012d5137fda Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 20:32:57 +0100 Subject: [PATCH 50/62] Another attempt to fix things --- .github/workflows/wheel-windows-build.yaml | 10 ++++++++++ .github/workflows/wheel-windows-tests.yaml | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 22ad66402..16c6027a5 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -14,6 +14,15 @@ on: type: string default: "./ci/build_wheel.ps1" + # general settings + matrix: + type: string + required: true + matrix_filter: + type: string + default: "." + + jobs: compute-matrix: runs-on: ubuntu-latest @@ -30,6 +39,7 @@ jobs: name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows needs: compute-matrix strategy: + fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} runs-on: windows-2022 steps: diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 577a3f63a..e7e627007 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -14,6 +14,14 @@ on: type: string default: "./ci/test_wheel.ps1" + # general settings + matrix: + type: string + required: true + matrix_filter: + type: string + default: "." 
+ jobs: compute-matrix: runs-on: ubuntu-latest From 145767c051a1b5380a5f55f908ad1c2d0db996a4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 3 Sep 2025 20:44:29 +0100 Subject: [PATCH 51/62] Another attempt at matrix fix --- .github/workflows/wheel-windows-build.yaml | 3 ++- .github/workflows/wheel-windows-tests.yaml | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 16c6027a5..ce6199328 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -22,7 +22,6 @@ on: type: string default: "." - jobs: compute-matrix: runs-on: ubuntu-latest @@ -31,6 +30,8 @@ jobs: steps: - name: Compute Python Test Matrix id: compute-matrix + env: + MATRIX: ${{ inputs.matrix }} run: | set -eo pipefail MATRIX=$(yq -n -o json 'env(MATRIX)' | jq -c) diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index e7e627007..89cdace2c 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -30,6 +30,8 @@ jobs: steps: - name: Compute Python Test Matrix id: compute-matrix + env: + MATRIX: ${{ inputs.matrix }} run: | set -eo pipefail MATRIX=$(yq -n -o json 'env(MATRIX)' | jq -c) From 3ecc2ac48e990e506dac89d95e8c1e6a92a79b74 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:17:52 +0100 Subject: [PATCH 52/62] Fix potential issue in yaml files --- .github/workflows/wheel-windows-build.yaml | 3 ++- .github/workflows/wheel-windows-tests.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index ce6199328..49a60e8f0 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -38,7 +38,8 @@ jobs: echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" build: name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows - needs: compute-matrix + needs: + - compute-matrix strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 89cdace2c..ca37c1059 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -38,7 +38,8 @@ jobs: echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" tests: name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows - needs: compute-matrix + needs: + - compute-matrix strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} From b7eac0c026faed71c1e5cff8eb2f2f94932d39ba Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:37:40 +0100 Subject: [PATCH 53/62] Experiment if issue is with matrix --- .github/workflows/wheel-windows-build.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 49a60e8f0..2f031a7f2 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -28,7 +28,7 @@ jobs: outputs: MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} steps: - - name: Compute Python Test Matrix + - name: Compute Build Matrix id: compute-matrix env: MATRIX: ${{ inputs.matrix }} @@ -42,7 +42,11 @@ jobs: - compute-matrix strategy: fail-fast: false - matrix: ${{ 
fromJSON(needs.compute-matrix.outputs.MATRIX) }} + matrix: + - {"PY_VER": "3.10", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.11", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.12", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.13", "CUDA_VER": "12.8.0"} runs-on: windows-2022 steps: - uses: actions/checkout@v4 From af752941dfc9d95d9a1d6004ef4d47981a53181f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:44:46 +0100 Subject: [PATCH 54/62] Does this fix it? --- .github/workflows/wheel-windows-build.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml index 2f031a7f2..42ac8c702 100644 --- a/.github/workflows/wheel-windows-build.yaml +++ b/.github/workflows/wheel-windows-build.yaml @@ -37,16 +37,17 @@ jobs: MATRIX=$(yq -n -o json 'env(MATRIX)' | jq -c) echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" build: - name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, windows + name: ${{ matrix.config.CUDA_VER }}, ${{ matrix.config.PY_VER }}, windows needs: - compute-matrix strategy: fail-fast: false matrix: - - {"PY_VER": "3.10", "CUDA_VER": "12.8.0"} - - {"PY_VER": "3.11", "CUDA_VER": "12.8.0"} - - {"PY_VER": "3.12", "CUDA_VER": "12.8.0"} - - {"PY_VER": "3.13", "CUDA_VER": "12.8.0"} + config: + - {"PY_VER": "3.10", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.11", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.12", "CUDA_VER": "12.8.0"} + - {"PY_VER": "3.13", "CUDA_VER": "12.8.0"} runs-on: windows-2022 steps: - uses: actions/checkout@v4 @@ -55,10 +56,10 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - - name: Set up Python ${{ matrix.PY_VER }} + - name: Set up Python ${{ matrix.config.PY_VER }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: ${{ matrix.PY_VER }} + python-version: ${{ matrix.config.PY_VER }} - name: Set up MSVC uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits @@ -67,11 +68,11 @@ jobs: run: | ${{ inputs.script }} env: - CUDA_VER: ${{ matrix.CUDA_VER }} + CUDA_VER: ${{ matrix.config.CUDA_VER }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel-py${{ matrix.PY_VER }}-windows + name: wheel-py${{ matrix.config.PY_VER }}-windows path: ${{ env.wheel_path }} From 9157012b04534c126fe6f0c144a7ab4703fbc80b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 10:53:43 +0100 Subject: [PATCH 55/62] Attempt to simplify Windows matrix setup --- .github/workflows/pr.yaml | 12 ------ .github/workflows/wheel-windows-build.yaml | 43 +++++----------------- .github/workflows/wheel-windows-tests.yaml | 34 ++++------------- ci/matrix.yml | 5 --- 4 files changed, 18 insertions(+), 76 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1c6df6a3d..b2b5a2a6a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -41,7 +41,6 @@ jobs: runs-on: ubuntu-latest outputs: BUILD_MATRIX: ${{ steps.compute-matrix.outputs.BUILD_MATRIX }} - WINDOWS_MATRIX: ${{ steps.compute-matrix.outputs.WINDOWS_MATRIX }} SIMULATOR_MATRIX: ${{ steps.compute-matrix.outputs.SIMULATOR_MATRIX }} TEST_MATRIX: ${{ steps.compute-matrix.outputs.TEST_MATRIX }} steps: @@ -50,16 +49,12 @@ jobs: id: compute-matrix run: | BUILD_MATRIX="$(yq '.build-matrix' ci/matrix.yml)" - WINDOWS_MATRIX="$(yq '.windows-matrix' ci/matrix.yml)" SIMULATOR_MATRIX="$(yq '.simulator-matrix' ci/matrix.yml)" TEST_MATRIX="$(yq '.test-matrix' ci/matrix.yml)" { echo 
'BUILD_MATRIX< Date: Thu, 4 Sep 2025 12:16:54 +0100 Subject: [PATCH 56/62] Remove accidental duplicate install of numba-cuda in docs build --- ci/build_docs.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index b4a791446..f5739b556 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -29,8 +29,6 @@ set +u conda activate docs set -u -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - pip install nvidia-sphinx-theme rapids-print-env From 300d90c37fe77edd7d2193a5a9366abe768b17c8 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 12:19:57 +0100 Subject: [PATCH 57/62] Remove some duplicate installation steps --- ci/test_conda.sh | 16 ---------------- ci/test_conda_ctypes_binding.sh | 2 -- ci/test_simulator.sh | 2 -- 3 files changed, 20 deletions(-) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 47fb3d01f..ed2d57cef 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-2-Clause set -euo pipefail -set -x . /opt/conda/etc/profile.d/conda.sh @@ -59,19 +58,6 @@ set -u pip install filecheck -# Detect system architecture to set conda repo path -ARCH=$(uname -m) -if [[ "$ARCH" == "x86_64" ]]; then - ARCH_SUFFIX="amd64" -elif [[ "$ARCH" == "aarch64" ]]; then - ARCH_SUFFIX="arm64" -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" @@ -99,13 +85,11 @@ print(test_dir) " rapids-logger "Build tests" - export NUMBA_CUDA_TEST_BIN_DIR=$(python -c "$GET_TEST_BINARY_DIR") pushd $NUMBA_CUDA_TEST_BIN_DIR make popd - rapids-logger "Run Tests" pytest --pyargs numba.cuda.tests -v diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index 5faaddedb..844b35b40 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -44,8 +44,6 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index c11e2f5b7..bb85a8733 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -31,8 +31,6 @@ set -u pip install filecheck -rapids-mamba-retry install -c `pwd`/conda-repo numba-cuda - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" pushd "${RAPIDS_TESTS_DIR}" From d27dd77a9961ee710b39731b4b7eb5103a081fe4 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 12:20:24 +0100 Subject: [PATCH 58/62] Remove unnecessary NumPy version pins --- conda/recipes/numba-cuda/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 8016ca6d3..b03309822 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -25,8 +25,7 @@ requirements: - python - pip - setuptools - - numpy >=2.1.0 # [py >= 310] - - numpy ==2.0.2 # [py == 39] + - numpy >=2.1.0 run: - python - numba >=0.59.1 From aa4ecc3bf8da2f0947119d9af33ef2427a9d5508 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Sep 2025 12:26:53 +0100 Subject: [PATCH 59/62] Remove debugging code --- numba_cuda/numba/cuda/testing.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/numba_cuda/numba/cuda/testing.py 
b/numba_cuda/numba/cuda/testing.py index f3700593c..373a66476 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -336,9 +336,3 @@ class ForeignArray(object): def __init__(self, arr): self._arr = arr self.__cuda_array_interface__ = arr.__cuda_array_interface__ - - -def test_binary_generation_dir(): - testing = Path(__file__) - binary_gen_dir = testing.parent / "tests" / "test_binary_generation" - return binary_gen_dir From bbe7b3cd51d0baf1143312cf301d76c6ef5882ac Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Fri, 5 Sep 2025 10:43:33 -0500 Subject: [PATCH 60/62] Minor changes to address review feedback Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- numba_cuda/numba/cuda/cext/_dispatcher.cpp | 5 +++++ numba_cuda/numba/cuda/cext/_pymodule.h | 2 +- numba_cuda/numba/cuda/dispatcher.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index 4f5d20b6c..bfd3c6518 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -43,6 +43,11 @@ #endif #undef _PyGC_FINALIZED +/* dynamic_annotations.h is needed for building Python with --with-valgrind + * support. The following include is to workaround issues described in + * https://github.com/numba/numba/pull/10073 + */ +#include "dynamic_annotations.h" #if (PY_MINOR_VERSION == 12) #include "internal/pycore_atomic.h" #endif diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h index cff21ebe7..4431dfff2 100644 --- a/numba_cuda/numba/cuda/cext/_pymodule.h +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -33,6 +33,6 @@ Py_DECREF(tmp); } while (0) -#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 9) || (PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) #endif /* NUMBA_PY_MODULE_H_ */ diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index be2674d67..ad3b57a85 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -1012,7 +1012,7 @@ def error_rewrite(e, issue_type): val = arg.value if isinstance(arg, OmittedArg) else arg try: tp = typeof(val, Purpose.argument) - except ValueError as typeof_exc: + except (errors.NumbaValueError, ValueError) as typeof_exc: failed_args.append((i, str(typeof_exc))) else: if tp is None: @@ -1306,7 +1306,7 @@ def typeof_pyval(self, val): """ try: tp = typeof(val, Purpose.argument) - except ValueError: + except (errors.NumbaValueError, ValueError): tp = types.pyobject else: if tp is None: From 8acf0dcdc50a5d98e0753cd447584a8927058d61 Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Thu, 25 Sep 2025 14:43:21 -0700 Subject: [PATCH 61/62] Fix formatting in _dispatcher.cpp --- numba_cuda/numba/cuda/cext/_dispatcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp index bfd3c6518..9b0b8f0dc 100644 --- a/numba_cuda/numba/cuda/cext/_dispatcher.cpp +++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp @@ -43,7 +43,7 @@ #endif #undef _PyGC_FINALIZED -/* dynamic_annotations.h is needed for building Python with --with-valgrind +/* dynamic_annotations.h is needed for building Python with 
--with-valgrind * support. The following include is to workaround issues described in * https://github.com/numba/numba/pull/10073 */ From 3db9ae85f090c791c924f24821ed1bf641fd096a Mon Sep 17 00:00:00 2001 From: Vijay Kandiah Date: Wed, 1 Oct 2025 20:58:31 -0700 Subject: [PATCH 62/62] Add warning about symbol conflicts if RTLD_GLOBAL is set for linux --- numba_cuda/numba/cuda/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 2c82b70d8..db14c30c9 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -5,6 +5,7 @@ from numba.cuda.core import config from .utils import _readenv import warnings +import sys # Enable pynvjitlink based on the following precedence: @@ -96,3 +97,13 @@ if numba_cuda_default_ptx_cc > config_default_cc: config.CUDA_DEFAULT_PTX_CC = numba_cuda_default_ptx_cc + + +# Warn if on Linux and RTLD_GLOBAL is enabled +if sys.platform.startswith("linux") and (sys.getdlopenflags() & 0x100) != 0: + warnings.warn( + "RTLD_GLOBAL is enabled, which might result in symbol resolution " + "conflicts when importing both numba and numba.cuda. Consider using " + "sys.setdlopenflags() to disable RTLD_GLOBAL " + "if you encounter symbol conflicts." + )
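
For reference, the flag test added in the final hunk above relies on RTLD_GLOBAL having the numeric value 0x100, which holds on Linux/glibc. The sketch below is illustrative only and is not part of the patch series: under that assumption it expresses the same check using os.RTLD_GLOBAL instead of the bare constant, and shows one way a caller could clear the flag before importing numba.cuda if the new warning fires.

# Illustrative sketch (assumption: Linux/glibc, where os.RTLD_GLOBAL == 0x100).
# Not part of the patches above; it mirrors the warning's check and shows one
# way a user could drop RTLD_GLOBAL before importing numba.cuda.
import os
import sys

if sys.platform.startswith("linux"):
    flags = sys.getdlopenflags()
    if flags & os.RTLD_GLOBAL:
        # Re-open subsequently loaded extension modules without RTLD_GLOBAL to
        # avoid symbol clashes between numba's and numba-cuda's C extensions.
        sys.setdlopenflags(flags & ~os.RTLD_GLOBAL)

import numba.cuda  # imported only after adjusting the dlopen flags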