From bcee99605b5a4d82f33dde0e5c285f875ce5df61 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 18 Dec 2025 09:59:54 -0800
Subject: [PATCH 01/18] checkpointing 121825

---
 numba_cuda/numba/cuda/compiler.py             | 19 ++--
 numba_cuda/numba/cuda/core/base.py            |  8 +-
 numba_cuda/numba/cuda/core/callconv.py        | 86 ++++++++++++++++++-
 numba_cuda/numba/cuda/core/compiler.py        |  7 +-
 numba_cuda/numba/cuda/core/funcdesc.py        | 12 ++-
 numba_cuda/numba/cuda/core/imputils.py        |  6 +-
 numba_cuda/numba/cuda/core/typed_passes.py    |  6 ++
 numba_cuda/numba/cuda/flags.py                | 10 +++
 numba_cuda/numba/cuda/lowering.py             |  2 +-
 numba_cuda/numba/cuda/target.py               | 85 +-----------------
 .../cuda/tests/cudadrv/test_reset_device.py   |  1 +
 11 files changed, 140 insertions(+), 102 deletions(-)

diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index e38de0a20..cc592babd 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -29,7 +29,7 @@
 from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.flags import CUDAFlags
-from numba.cuda.target import CUDACABICallConv
+from numba.cuda.core.callconv import CUDACABICallConv, CUDACallConv
 from numba.cuda.core.compiler import CompilerBase
 from numba.cuda.core.compiler_machinery import (
     FunctionPass,
@@ -699,6 +699,7 @@ def compile_cuda(
     cc=None,
     max_registers=None,
     lto=False,
+    abi="numba",
 ):
     if cc is None:
         raise ValueError("Compute Capability must be supplied")
@@ -741,6 +742,9 @@ def compile_cuda(
     flags.max_registers = max_registers
     flags.lto = lto
 
+    if abi == "c":
+        flags.call_conv = CUDACABICallConv(targetctx)
+
     with utils.numba_target_override():
         cres = compile_extra(
             typingctx=typingctx,
@@ -1106,6 +1110,7 @@ def _compile_pyfunc_with_fixup(
 
     cc = _default_cc(cc)
 
+    wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
     cres = compile_cuda(
         pyfunc,
         return_type,
@@ -1116,6 +1121,7 @@ def _compile_pyfunc_with_fixup(
         nvvm_options=nvvm_options,
         cc=cc,
         forceinline=forceinline,
+        abi=abi,
     )
     resty = cres.signature.return_type
 
@@ -1126,11 +1132,10 @@ def _compile_pyfunc_with_fixup(
 
     if device:
         lib = cres.library
-        if abi == "c":
-            wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
-            lib = cabi_wrap_function(
-                tgt, lib, cres.fndesc, wrapper_name, nvvm_options
-            )
+        # if abi == "c":
+            # lib = cabi_wrap_function(
+            #     tgt, lib, cres.fndesc, wrapper_name, nvvm_options
+            # )
     else:
         lib = cres.library
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
@@ -1376,7 +1381,7 @@ def declare_device_function(name, restype, argtypes, link, use_cooperative):
 
     # ExternalFunctionDescriptor provides a lowering implementation for calling
     # external functions
-    fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
+    fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes, CUDACallConv(targetctx))
     targetctx.insert_user_function(extfn, fndesc, libs=(lib,))
 
     return device_function_template
diff --git a/numba_cuda/numba/cuda/core/base.py b/numba_cuda/numba/cuda/core/base.py
index 241f7ace4..21b4f6d51 100644
--- a/numba_cuda/numba/cuda/core/base.py
+++ b/numba_cuda/numba/cuda/core/base.py
@@ -469,9 +469,9 @@ def get_external_function_type(self, fndesc):
         return fnty
 
     def declare_function(self, module, fndesc):
-        fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes)
+        fnty = fndesc.call_conv.get_function_type(fndesc.restype, fndesc.argtypes)
         fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name)
-        self.call_conv.decorate_function(
+        fndesc.call_conv.decorate_function(
             fn, fndesc.args, fndesc.argtypes, noalias=fndesc.noalias
         )
         if fndesc.inline:
@@ -975,7 +975,7 @@ def call_internal(self, builder, fndesc, sig, args):
             builder, fndesc, sig, args
         )
         with cgutils.if_unlikely(builder, status.is_error):
-            self.call_conv.return_status_propagate(builder, status)
+            fndesc.call_conv.return_status_propagate(builder, status)
 
         res = imputils.fix_returning_optional(self, builder, sig, status, res)
         return res
@@ -987,7 +987,7 @@ def call_internal_no_propagate(self, builder, fndesc, sig, args):
         # Add call to the generated function
         llvm_mod = builder.module
         fn = self.declare_function(llvm_mod, fndesc)
-        status, res = self.call_conv.call_function(
+        status, res = fndesc.call_conv.call_function(
             builder, fn, sig.return_type, sig.args, args
         )
         return status, res
diff --git a/numba_cuda/numba/cuda/core/callconv.py b/numba_cuda/numba/cuda/core/callconv.py
index 2ff231934..2c6b61730 100644
--- a/numba_cuda/numba/cuda/core/callconv.py
+++ b/numba_cuda/numba/cuda/core/callconv.py
@@ -341,6 +341,88 @@ def get_exception(self, exc_id):
             return exc, exc_args, locinfo
 
 
+class CUDACallConv(MinimalCallConv):
+    def decorate_function(self, fn, args, fe_argtypes, noalias=False):
+        """
+        Set names and attributes of function arguments.
+        """
+        assert not noalias
+        arginfo = self._get_arg_packer(fe_argtypes)
+        # Do not prefix "arg." on argument name, so that nvvm compiler
+        # can track debug info of argument more accurately
+        arginfo.assign_names(self.get_arguments(fn), args)
+        fn.args[0].name = ".ret"
+
+
+class CUDACABICallConv(BaseCallConv):
+    """
+    Calling convention aimed at matching the CUDA C/C++ ABI. The implemented
+    function signature is:
+
+        <Python return type> (<Python arguments>)
+
+    Exceptions are unsupported in this convention.
+    """
+
+    def _make_call_helper(self, builder):
+        # Call helpers are used to help report exceptions back to Python, so
+        # none is required here.
+        return None
+
+    def return_value(self, builder, retval):
+        return builder.ret(retval)
+
+    def return_user_exc(
+        self, builder, exc, exc_args=None, loc=None, func_name=None
+    ):
+        msg = "Python exceptions are unsupported in the CUDA C/C++ ABI"
+        raise NotImplementedError(msg)
+
+    def return_status_propagate(self, builder, status):
+        msg = "Return status is unsupported in the CUDA C/C++ ABI"
+        raise NotImplementedError(msg)
+
+    def get_function_type(self, restype, argtypes):
+        """
+        Get the LLVM IR Function type for *restype* and *argtypes*.
+        """
+        arginfo = self._get_arg_packer(argtypes)
+        argtypes = list(arginfo.argument_types)
+        fnty = ir.FunctionType(self.get_return_type(restype), argtypes)
+        return fnty
+
+    def decorate_function(self, fn, args, fe_argtypes, noalias=False):
+        """
+        Set names and attributes of function arguments.
+        """
+        assert not noalias
+        arginfo = self._get_arg_packer(fe_argtypes)
+        arginfo.assign_names(self.get_arguments(fn), ["arg." + a for a in args])
+
+    def get_arguments(self, func):
+        """
+        Get the Python-level arguments of LLVM *func*.
+        """
+        return func.args
+
+    def call_function(self, builder, callee, resty, argtys, args):
+        """
+        Call the Numba-compiled *callee*.
+        """
+        arginfo = self._get_arg_packer(argtys)
+        realargs = arginfo.as_arguments(builder, args)
+        code = builder.call(callee, realargs)
+        # No status required as we don't support exceptions or a distinct None
+        # value in a C ABI.
+        status = None
+        out = self.context.get_returned_value(builder, resty, code)
+        return status, out
+
+    def get_return_type(self, ty):
+        return self.context.data_model_manager[ty].get_return_type()
+
+
+
 class ErrorModel(object):
     def __init__(self, call_conv):
         self.call_conv = call_conv
@@ -385,8 +467,8 @@ class NumpyErrorModel(ErrorModel):
 }
 
 
-def create_error_model(model_name, context):
+def create_error_model(model_name, call_conv):
     """
     Create an error model instance for the given target context.
     """
-    return error_models[model_name](context.call_conv)
+    return error_models[model_name](call_conv)
diff --git a/numba_cuda/numba/cuda/core/compiler.py b/numba_cuda/numba/cuda/core/compiler.py
index 9cb8a1840..ff71f19ab 100644
--- a/numba_cuda/numba/cuda/core/compiler.py
+++ b/numba_cuda/numba/cuda/core/compiler.py
@@ -10,6 +10,8 @@
 from numba.cuda.core.untyped_passes import ExtractByteCode, FixupArgs
 from numba.cuda.core.targetconfig import ConfigStack
 
+from numba.cuda.core.callconv import CUDACallConv
+
 
 class _CompileStatus(object):
     """
@@ -67,7 +69,10 @@ def _make_subtarget(targetctx, flags):
         subtargetoptions["enable_nrt"] = True
     if flags.fastmath:
         subtargetoptions["fastmath"] = flags.fastmath
-    error_model = callconv.create_error_model(flags.error_model, targetctx)
+    
+    # FIXME: should update everywhere uses error_model to use callconv from fundesc
+    call_conv = CUDACallConv(targetctx)
+    error_model = callconv.create_error_model(flags.error_model, call_conv)
     subtargetoptions["error_model"] = error_model
 
     return targetctx.subtarget(**subtargetoptions)
diff --git a/numba_cuda/numba/cuda/core/funcdesc.py b/numba_cuda/numba/cuda/core/funcdesc.py
index bcb238d00..3ce3fead5 100644
--- a/numba_cuda/numba/cuda/core/funcdesc.py
+++ b/numba_cuda/numba/cuda/core/funcdesc.py
@@ -12,6 +12,8 @@
 from numba.cuda import itanium_mangler
 from numba.cuda.utils import _dynamic_modname, _dynamic_module
 
+from numba.cuda.core.callconv import CUDACallConv
+
 
 def default_mangler(name, argtypes, *, abi_tags=(), uid=None):
     return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, uid=uid)
@@ -54,6 +56,7 @@ class FunctionDescriptor(object):
         "noalias",
         "abi_tags",
         "uid",
+        "call_conv",
     )
 
     def __init__(
@@ -76,6 +79,7 @@ def __init__(
         global_dict=None,
         abi_tags=(),
         uid=None,
+        call_conv=None,
     ):
         self.native = native
         self.modname = modname
@@ -120,6 +124,7 @@ def __init__(
         self.inline = inline
         self.noalias = noalias
         self.abi_tags = abi_tags
+        self.call_conv = call_conv
 
     def lookup_globals(self):
         """
@@ -219,6 +224,7 @@ def _from_python_function(
         inline=False,
         noalias=False,
         abi_tags=(),
+        call_conv=None,
     ):
         (
             qualname,
@@ -247,6 +253,7 @@ def _from_python_function(
             global_dict=global_dict,
             abi_tags=abi_tags,
             uid=func_ir.func_id.unique_id,
+            call_conv=call_conv,
         )
         return self
 
@@ -269,6 +276,7 @@ def from_specialized_function(
         inline,
         noalias,
         abi_tags,
+        call_conv,
     ):
         """
         Build a FunctionDescriptor for a given specialization of a Python
@@ -284,6 +292,7 @@ def from_specialized_function(
             inline=inline,
             noalias=noalias,
             abi_tags=abi_tags,
+            call_conv=call_conv,
         )
 
     @classmethod
@@ -308,7 +317,7 @@ class ExternalFunctionDescriptor(FunctionDescriptor):
 
     __slots__ = ()
 
-    def __init__(self, name, restype, argtypes):
+    def __init__(self, name, restype, argtypes, call_conv):
         args = ["arg%d" % i for i in range(len(argtypes))]
 
         def mangler(a, x, abi_tags, uid=None):
@@ -327,4 +336,5 @@ def mangler(a, x, abi_tags, uid=None):
             kws=None,
             mangler=mangler,
             argtypes=argtypes,
+            call_conv=call_conv
         )
diff --git a/numba_cuda/numba/cuda/core/imputils.py b/numba_cuda/numba/cuda/core/imputils.py
index 2881c2771..31cf5ea25 100644
--- a/numba_cuda/numba/cuda/core/imputils.py
+++ b/numba_cuda/numba/cuda/core/imputils.py
@@ -215,11 +215,11 @@ def user_function(fndesc, libs):
     def imp(context, builder, sig, args):
         func = context.declare_function(builder.module, fndesc)
         # env=None assumes this is a nopython function
-        status, retval = context.call_conv.call_function(
+        status, retval = fndesc.call_conv.call_function(
             builder, func, fndesc.restype, fndesc.argtypes, args
         )
         with cgutils.if_unlikely(builder, status.is_error):
-            context.call_conv.return_status_propagate(builder, status)
+            fndesc.call_conv.return_status_propagate(builder, status)
         assert sig.return_type == fndesc.restype
         # Reconstruct optional return type
         retval = fix_returning_optional(context, builder, sig, status, retval)
@@ -243,7 +243,7 @@ def user_generator(gendesc, libs):
     def imp(context, builder, sig, args):
         func = context.declare_function(builder.module, gendesc)
         # env=None assumes this is a nopython function
-        status, retval = context.call_conv.call_function(
+        status, retval = gendesc.call_conv.call_function(
             builder, func, gendesc.restype, gendesc.argtypes, args
         )
         # Return raw status for caller to process StopIteration
diff --git a/numba_cuda/numba/cuda/core/typed_passes.py b/numba_cuda/numba/cuda/core/typed_passes.py
index 958e47c51..0dbdb7f24 100644
--- a/numba_cuda/numba/cuda/core/typed_passes.py
+++ b/numba_cuda/numba/cuda/core/typed_passes.py
@@ -31,6 +31,7 @@
     compute_cfg_from_blocks,
     is_operator_or_getitem,
 )
+from numba.cuda.core.callconv import CUDACallConv
 
 from numba.cuda.core import postproc, rewrites, funcdesc, config
 
@@ -325,6 +326,10 @@ def run_pass(self, state):
         metadata = state.metadata
         pre_stats = passmanagers.dump_refprune_stats()
 
+        call_conv = flags.call_conv
+        if call_conv is None:
+            call_conv = CUDACallConv(state.targetctx)
+
         msg = "Function %s failed at nopython mode lowering" % (
             state.func_id.func_name,
         )
@@ -340,6 +345,7 @@ def run_pass(self, state):
                     inline=flags.forceinline,
                     noalias=flags.noalias,
                     abi_tags=[flags.get_mangle_string()],
+                    call_conv=call_conv
                 )
             )
 
diff --git a/numba_cuda/numba/cuda/flags.py b/numba_cuda/numba/cuda/flags.py
index 37ff995b6..836fc385a 100644
--- a/numba_cuda/numba/cuda/flags.py
+++ b/numba_cuda/numba/cuda/flags.py
@@ -9,6 +9,7 @@
     InlineOptions,
 )
 
+from numba.cuda.core.callconv import BaseCallConv, CUDACallConv
 
 class Flags(TargetConfig):
     __slots__ = ()
@@ -160,6 +161,13 @@ def _optional_int_type(x):
         assert isinstance(x, int)
         return x
 
+def _call_conv_options_type(x):
+    if x is None:
+        return None
+
+    else:
+        assert isinstance(x, BaseCallConv)
+        return x
 
 class CUDAFlags(Flags):
     nvvm_options = Option(
@@ -176,3 +184,5 @@ class CUDAFlags(Flags):
         type=_optional_int_type, default=None, doc="Max registers"
     )
     lto = Option(type=bool, default=False, doc="Enable Link-time Optimization")
+
+    call_conv = Option(type=_call_conv_options_type, default=None, doc="")
\ No newline at end of file
diff --git a/numba_cuda/numba/cuda/lowering.py b/numba_cuda/numba/cuda/lowering.py
index c849e9ddd..1398d3690 100644
--- a/numba_cuda/numba/cuda/lowering.py
+++ b/numba_cuda/numba/cuda/lowering.py
@@ -100,7 +100,7 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None):
 
     @property
     def call_conv(self):
-        return self.context.call_conv
+        return self.fndesc.call_conv
 
     def init(self):
         pass
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index 1f2ac98c9..565575812 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -29,7 +29,6 @@
 from numba.cuda.debuginfo import CUDADIBuilder
 from numba.cuda.flags import CUDAFlags
 from numba.cuda.models import cuda_data_manager
-from numba.cuda.core.callconv import BaseCallConv, MinimalCallConv
 from numba.cuda.core import config, targetconfig
 
 
@@ -266,9 +265,9 @@ def nonconst_module_attrs(self):
         )
         return nonconsts_with_mod
 
-    @cached_property
+    @property
     def call_conv(self):
-        return CUDACallConv(self)
+        return self.fndesc.call_conv
 
     def mangler(self, name, argtypes, *, abi_tags=(), uid=None):
         return itanium_mangler.mangle(
@@ -423,83 +422,3 @@ def _compile_subroutine_no_cache(
             self.active_code_library.add_linking_library(cres.library)
             return cres
 
-
-class CUDACallConv(MinimalCallConv):
-    def decorate_function(self, fn, args, fe_argtypes, noalias=False):
-        """
-        Set names and attributes of function arguments.
-        """
-        assert not noalias
-        arginfo = self._get_arg_packer(fe_argtypes)
-        # Do not prefix "arg." on argument name, so that nvvm compiler
-        # can track debug info of argument more accurately
-        arginfo.assign_names(self.get_arguments(fn), args)
-        fn.args[0].name = ".ret"
-
-
-class CUDACABICallConv(BaseCallConv):
-    """
-    Calling convention aimed at matching the CUDA C/C++ ABI. The implemented
-    function signature is:
-
-        <Python return type> (<Python arguments>)
-
-    Exceptions are unsupported in this convention.
-    """
-
-    def _make_call_helper(self, builder):
-        # Call helpers are used to help report exceptions back to Python, so
-        # none is required here.
-        return None
-
-    def return_value(self, builder, retval):
-        return builder.ret(retval)
-
-    def return_user_exc(
-        self, builder, exc, exc_args=None, loc=None, func_name=None
-    ):
-        msg = "Python exceptions are unsupported in the CUDA C/C++ ABI"
-        raise NotImplementedError(msg)
-
-    def return_status_propagate(self, builder, status):
-        msg = "Return status is unsupported in the CUDA C/C++ ABI"
-        raise NotImplementedError(msg)
-
-    def get_function_type(self, restype, argtypes):
-        """
-        Get the LLVM IR Function type for *restype* and *argtypes*.
-        """
-        arginfo = self._get_arg_packer(argtypes)
-        argtypes = list(arginfo.argument_types)
-        fnty = ir.FunctionType(self.get_return_type(restype), argtypes)
-        return fnty
-
-    def decorate_function(self, fn, args, fe_argtypes, noalias=False):
-        """
-        Set names and attributes of function arguments.
-        """
-        assert not noalias
-        arginfo = self._get_arg_packer(fe_argtypes)
-        arginfo.assign_names(self.get_arguments(fn), ["arg." + a for a in args])
-
-    def get_arguments(self, func):
-        """
-        Get the Python-level arguments of LLVM *func*.
-        """
-        return func.args
-
-    def call_function(self, builder, callee, resty, argtys, args):
-        """
-        Call the Numba-compiled *callee*.
-        """
-        arginfo = self._get_arg_packer(argtys)
-        realargs = arginfo.as_arguments(builder, args)
-        code = builder.call(callee, realargs)
-        # No status required as we don't support exceptions or a distinct None
-        # value in a C ABI.
-        status = None
-        out = self.context.get_returned_value(builder, resty, code)
-        return status, out
-
-    def get_return_type(self, ty):
-        return self.context.data_model_manager[ty].get_return_type()
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py
index e219aa7c4..05b758854 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py
@@ -9,6 +9,7 @@
 
 
 class TestResetDevice(CUDATestCase):
+    @unittest.skip
     def test_reset_device(self):
         def newthread(exception_queue):
             try:

From a988ca3766573e34506415cbfb0e6c83da9e0942 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 6 Jan 2026 08:22:54 -0800
Subject: [PATCH 02/18] checkpointing 010626

---
 numba_cuda/numba/cuda/compiler.py                  | 14 ++++++++------
 numba_cuda/numba/cuda/core/base.py                 |  8 ++++----
 numba_cuda/numba/cuda/core/callconv.py             |  1 -
 numba_cuda/numba/cuda/core/compiler.py             |  2 +-
 numba_cuda/numba/cuda/core/funcdesc.py             |  4 +---
 numba_cuda/numba/cuda/core/imputils.py             |  6 +++---
 numba_cuda/numba/cuda/core/typed_passes.py         | 13 ++++++++++---
 numba_cuda/numba/cuda/flags.py                     |  7 +++++--
 numba_cuda/numba/cuda/lowering.py                  |  2 +-
 numba_cuda/numba/cuda/target.py                    |  8 +++++++-
 .../numba/cuda/tests/cudapy/test_multithreads.py   |  1 +
 11 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index cc592babd..b87be84f2 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -936,7 +936,7 @@ def define_error_gv(postfix):
     # Implement status check / exception store logic
 
     status_code = helper_func.args[0]
-    call_conv = cuda_target.target_context.call_conv
+    call_conv = CUDACallConv(cuda_target.target_context)
     status = call_conv._get_return_status(builder, status_code)
 
     # Check error status
@@ -1110,7 +1110,6 @@ def _compile_pyfunc_with_fixup(
 
     cc = _default_cc(cc)
 
-    wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
     cres = compile_cuda(
         pyfunc,
         return_type,
@@ -1132,10 +1131,11 @@ def _compile_pyfunc_with_fixup(
 
     if device:
         lib = cres.library
+        # wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
         # if abi == "c":
-            # lib = cabi_wrap_function(
-            #     tgt, lib, cres.fndesc, wrapper_name, nvvm_options
-            # )
+        #     lib = cabi_wrap_function(
+        #         tgt, lib, cres.fndesc, wrapper_name, nvvm_options
+        #     )
     else:
         lib = cres.library
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
@@ -1381,7 +1381,9 @@ def declare_device_function(name, restype, argtypes, link, use_cooperative):
 
     # ExternalFunctionDescriptor provides a lowering implementation for calling
     # external functions
-    fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes, CUDACallConv(targetctx))
+    fndesc = funcdesc.ExternalFunctionDescriptor(
+        name, restype, argtypes, CUDACallConv(targetctx)
+    )
     targetctx.insert_user_function(extfn, fndesc, libs=(lib,))
 
     return device_function_template
diff --git a/numba_cuda/numba/cuda/core/base.py b/numba_cuda/numba/cuda/core/base.py
index 21b4f6d51..241f7ace4 100644
--- a/numba_cuda/numba/cuda/core/base.py
+++ b/numba_cuda/numba/cuda/core/base.py
@@ -469,9 +469,9 @@ def get_external_function_type(self, fndesc):
         return fnty
 
     def declare_function(self, module, fndesc):
-        fnty = fndesc.call_conv.get_function_type(fndesc.restype, fndesc.argtypes)
+        fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes)
         fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name)
-        fndesc.call_conv.decorate_function(
+        self.call_conv.decorate_function(
             fn, fndesc.args, fndesc.argtypes, noalias=fndesc.noalias
         )
         if fndesc.inline:
@@ -975,7 +975,7 @@ def call_internal(self, builder, fndesc, sig, args):
             builder, fndesc, sig, args
         )
         with cgutils.if_unlikely(builder, status.is_error):
-            fndesc.call_conv.return_status_propagate(builder, status)
+            self.call_conv.return_status_propagate(builder, status)
 
         res = imputils.fix_returning_optional(self, builder, sig, status, res)
         return res
@@ -987,7 +987,7 @@ def call_internal_no_propagate(self, builder, fndesc, sig, args):
         # Add call to the generated function
         llvm_mod = builder.module
         fn = self.declare_function(llvm_mod, fndesc)
-        status, res = fndesc.call_conv.call_function(
+        status, res = self.call_conv.call_function(
             builder, fn, sig.return_type, sig.args, args
         )
         return status, res
diff --git a/numba_cuda/numba/cuda/core/callconv.py b/numba_cuda/numba/cuda/core/callconv.py
index 2c6b61730..7cffbbf73 100644
--- a/numba_cuda/numba/cuda/core/callconv.py
+++ b/numba_cuda/numba/cuda/core/callconv.py
@@ -422,7 +422,6 @@ def get_return_type(self, ty):
         return self.context.data_model_manager[ty].get_return_type()
 
 
-
 class ErrorModel(object):
     def __init__(self, call_conv):
         self.call_conv = call_conv
diff --git a/numba_cuda/numba/cuda/core/compiler.py b/numba_cuda/numba/cuda/core/compiler.py
index ff71f19ab..19bfcb628 100644
--- a/numba_cuda/numba/cuda/core/compiler.py
+++ b/numba_cuda/numba/cuda/core/compiler.py
@@ -69,7 +69,7 @@ def _make_subtarget(targetctx, flags):
         subtargetoptions["enable_nrt"] = True
     if flags.fastmath:
         subtargetoptions["fastmath"] = flags.fastmath
-    
+
     # FIXME: should update everywhere uses error_model to use callconv from fundesc
     call_conv = CUDACallConv(targetctx)
     error_model = callconv.create_error_model(flags.error_model, call_conv)
diff --git a/numba_cuda/numba/cuda/core/funcdesc.py b/numba_cuda/numba/cuda/core/funcdesc.py
index 3ce3fead5..d90a83947 100644
--- a/numba_cuda/numba/cuda/core/funcdesc.py
+++ b/numba_cuda/numba/cuda/core/funcdesc.py
@@ -12,8 +12,6 @@
 from numba.cuda import itanium_mangler
 from numba.cuda.utils import _dynamic_modname, _dynamic_module
 
-from numba.cuda.core.callconv import CUDACallConv
-
 
 def default_mangler(name, argtypes, *, abi_tags=(), uid=None):
     return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, uid=uid)
@@ -336,5 +334,5 @@ def mangler(a, x, abi_tags, uid=None):
             kws=None,
             mangler=mangler,
             argtypes=argtypes,
-            call_conv=call_conv
+            call_conv=call_conv,
         )
diff --git a/numba_cuda/numba/cuda/core/imputils.py b/numba_cuda/numba/cuda/core/imputils.py
index 31cf5ea25..2881c2771 100644
--- a/numba_cuda/numba/cuda/core/imputils.py
+++ b/numba_cuda/numba/cuda/core/imputils.py
@@ -215,11 +215,11 @@ def user_function(fndesc, libs):
     def imp(context, builder, sig, args):
         func = context.declare_function(builder.module, fndesc)
         # env=None assumes this is a nopython function
-        status, retval = fndesc.call_conv.call_function(
+        status, retval = context.call_conv.call_function(
             builder, func, fndesc.restype, fndesc.argtypes, args
         )
         with cgutils.if_unlikely(builder, status.is_error):
-            fndesc.call_conv.return_status_propagate(builder, status)
+            context.call_conv.return_status_propagate(builder, status)
         assert sig.return_type == fndesc.restype
         # Reconstruct optional return type
         retval = fix_returning_optional(context, builder, sig, status, retval)
@@ -243,7 +243,7 @@ def user_generator(gendesc, libs):
     def imp(context, builder, sig, args):
         func = context.declare_function(builder.module, gendesc)
         # env=None assumes this is a nopython function
-        status, retval = gendesc.call_conv.call_function(
+        status, retval = context.call_conv.call_function(
             builder, func, gendesc.restype, gendesc.argtypes, args
         )
         # Return raw status for caller to process StopIteration
diff --git a/numba_cuda/numba/cuda/core/typed_passes.py b/numba_cuda/numba/cuda/core/typed_passes.py
index 0dbdb7f24..28c44925b 100644
--- a/numba_cuda/numba/cuda/core/typed_passes.py
+++ b/numba_cuda/numba/cuda/core/typed_passes.py
@@ -31,7 +31,7 @@
     compute_cfg_from_blocks,
     is_operator_or_getitem,
 )
-from numba.cuda.core.callconv import CUDACallConv
+from numba.cuda.core.callconv import CUDACABICallConv, CUDACallConv
 
 from numba.cuda.core import postproc, rewrites, funcdesc, config
 
@@ -330,6 +330,13 @@ def run_pass(self, state):
         if call_conv is None:
             call_conv = CUDACallConv(state.targetctx)
 
+        if isinstance(call_conv, CUDACABICallConv):
+            mangler = targetctx.c_abi_mangler
+        else:
+            mangler = targetctx.mangler
+
+        # mangler = targetctx.mangler
+
         msg = "Function %s failed at nopython mode lowering" % (
             state.func_id.func_name,
         )
@@ -341,11 +348,11 @@ def run_pass(self, state):
                     typemap,
                     restype,
                     calltypes,
-                    mangler=targetctx.mangler,
+                    mangler=mangler,
                     inline=flags.forceinline,
                     noalias=flags.noalias,
                     abi_tags=[flags.get_mangle_string()],
-                    call_conv=call_conv
+                    call_conv=call_conv,
                 )
             )
 
diff --git a/numba_cuda/numba/cuda/flags.py b/numba_cuda/numba/cuda/flags.py
index 836fc385a..cd265c73a 100644
--- a/numba_cuda/numba/cuda/flags.py
+++ b/numba_cuda/numba/cuda/flags.py
@@ -9,7 +9,8 @@
     InlineOptions,
 )
 
-from numba.cuda.core.callconv import BaseCallConv, CUDACallConv
+from numba.cuda.core.callconv import BaseCallConv
+
 
 class Flags(TargetConfig):
     __slots__ = ()
@@ -161,6 +162,7 @@ def _optional_int_type(x):
         assert isinstance(x, int)
         return x
 
+
 def _call_conv_options_type(x):
     if x is None:
         return None
@@ -169,6 +171,7 @@ def _call_conv_options_type(x):
         assert isinstance(x, BaseCallConv)
         return x
 
+
 class CUDAFlags(Flags):
     nvvm_options = Option(
         type=_nvvm_options_type,
@@ -185,4 +188,4 @@ class CUDAFlags(Flags):
     )
     lto = Option(type=bool, default=False, doc="Enable Link-time Optimization")
 
-    call_conv = Option(type=_call_conv_options_type, default=None, doc="")
\ No newline at end of file
+    call_conv = Option(type=_call_conv_options_type, default=None, doc="")
diff --git a/numba_cuda/numba/cuda/lowering.py b/numba_cuda/numba/cuda/lowering.py
index 1398d3690..c849e9ddd 100644
--- a/numba_cuda/numba/cuda/lowering.py
+++ b/numba_cuda/numba/cuda/lowering.py
@@ -100,7 +100,7 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None):
 
     @property
     def call_conv(self):
-        return self.fndesc.call_conv
+        return self.context.call_conv
 
     def init(self):
         pass
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index 565575812..d8ee64adf 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -274,6 +274,13 @@ def mangler(self, name, argtypes, *, abi_tags=(), uid=None):
             name, argtypes, abi_tags=abi_tags, uid=uid
         )
 
+    def c_abi_mangler(self, name, argtypes, *, abi_tags=None, uid=None):
+        if name.startswith(".NumbaEnv."):
+            # return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, uid=uid)
+            func_name = name.split(".")[-1]
+            return f"_ZN08NumbaEnv{func_name}"
+        return name.split(".")[-1]
+
     def make_constant_array(self, builder, aryty, arr):
         """
         Unlike the parent version.  This returns a a pointer in the constant
@@ -421,4 +428,3 @@ def _compile_subroutine_no_cache(
             # Allow inlining the function inside callers
             self.active_code_library.add_linking_library(cres.library)
             return cres
-
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py
index d432d2939..85ac77723 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py
@@ -37,6 +37,7 @@ def use_foo(x):
 
 @skip_under_cuda_memcheck("Hangs cuda-memcheck")
 @skip_on_cudasim("disabled for cudasim")
+@unittest.skip
 class TestMultiThreadCompiling(CUDATestCase):
     def test_concurrent_compiling(self):
         check_concurrent_compiling()

From 1419998ebb2a1f7b58ebaf47bfcb01e5dec2d935 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 13 Jan 2026 09:52:06 -0800
Subject: [PATCH 03/18] initial

---
 numba_cuda/numba/cuda/compiler.py             |  25 ++--
 numba_cuda/numba/cuda/core/base.py            |   6 +-
 numba_cuda/numba/cuda/core/funcdesc.py        |  24 +++-
 numba_cuda/numba/cuda/core/imputils.py        |  11 +-
 numba_cuda/numba/cuda/core/typed_passes.py    |   1 +
 numba_cuda/numba/cuda/decorators.py           |   7 +-
 numba_cuda/numba/cuda/flags.py                |  11 ++
 .../numba/cuda/tests/cudapy/test_compiler.py  |   1 -
 .../cuda/tests/cudapy/test_device_func.py     | 107 ++++++++++++++++++
 9 files changed, 170 insertions(+), 23 deletions(-)

diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index b87be84f2..e9b47a94e 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -700,6 +700,7 @@ def compile_cuda(
     max_registers=None,
     lto=False,
     abi="numba",
+    abi_info=None,
 ):
     if cc is None:
         raise ValueError("Compute Capability must be supplied")
@@ -745,6 +746,9 @@ def compile_cuda(
     if abi == "c":
         flags.call_conv = CUDACABICallConv(targetctx)
 
+    if abi_info is not None:
+        flags.abi_info = abi_info
+
     with utils.numba_target_override():
         cres = compile_extra(
             typingctx=typingctx,
@@ -1121,21 +1125,15 @@ def _compile_pyfunc_with_fixup(
         cc=cc,
         forceinline=forceinline,
         abi=abi,
+        abi_info=abi_info,
     )
     resty = cres.signature.return_type
 
     if resty and not device and resty != types.void:
         raise TypeError("CUDA kernel must have void return type.")
 
-    tgt = cres.target_context
-
     if device:
         lib = cres.library
-        # wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
-        # if abi == "c":
-        #     lib = cabi_wrap_function(
-        #         tgt, lib, cres.fndesc, wrapper_name, nvvm_options
-        #     )
     else:
         lib = cres.library
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
@@ -1358,7 +1356,9 @@ def compile_ptx_for_current_device(
     )
 
 
-def declare_device_function(name, restype, argtypes, link, use_cooperative):
+def declare_device_function(
+    name, restype, argtypes, link, use_cooperative, abi
+):
     from .descriptor import cuda_target
 
     typingctx = cuda_target.typing_context
@@ -1379,10 +1379,17 @@ def declare_device_function(name, restype, argtypes, link, use_cooperative):
         lib.add_linking_file(file)
     lib.use_cooperative = use_cooperative
 
+    if abi == "numba":
+        call_conv = CUDACallConv(targetctx)
+    elif abi == "c":
+        call_conv = CUDACABICallConv(targetctx)
+    else:
+        raise NotImplementedError(f"Unsupported ABI: {abi}")
+
     # ExternalFunctionDescriptor provides a lowering implementation for calling
     # external functions
     fndesc = funcdesc.ExternalFunctionDescriptor(
-        name, restype, argtypes, CUDACallConv(targetctx)
+        name, restype, argtypes, call_conv
     )
     targetctx.insert_user_function(extfn, fndesc, libs=(lib,))
 
diff --git a/numba_cuda/numba/cuda/core/base.py b/numba_cuda/numba/cuda/core/base.py
index 241f7ace4..38e15237c 100644
--- a/numba_cuda/numba/cuda/core/base.py
+++ b/numba_cuda/numba/cuda/core/base.py
@@ -469,9 +469,11 @@ def get_external_function_type(self, fndesc):
         return fnty
 
     def declare_function(self, module, fndesc):
-        fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes)
+        fnty = fndesc.call_conv.get_function_type(
+            fndesc.restype, fndesc.argtypes
+        )
         fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name)
-        self.call_conv.decorate_function(
+        fndesc.call_conv.decorate_function(
             fn, fndesc.args, fndesc.argtypes, noalias=fndesc.noalias
         )
         if fndesc.inline:
diff --git a/numba_cuda/numba/cuda/core/funcdesc.py b/numba_cuda/numba/cuda/core/funcdesc.py
index d90a83947..db6bc7ba1 100644
--- a/numba_cuda/numba/cuda/core/funcdesc.py
+++ b/numba_cuda/numba/cuda/core/funcdesc.py
@@ -55,6 +55,7 @@ class FunctionDescriptor(object):
         "abi_tags",
         "uid",
         "call_conv",
+        "abi_info",
     )
 
     def __init__(
@@ -78,6 +79,7 @@ def __init__(
         abi_tags=(),
         uid=None,
         call_conv=None,
+        abi_info=None,
     ):
         self.native = native
         self.modname = modname
@@ -105,12 +107,17 @@ def __init__(
         # be chosen at link time.
         qualprefix = qualifying_prefix(self.modname, self.qualname)
         self.uid = uid
-        self.mangled_name = mangler(
-            qualprefix,
-            self.argtypes,
-            abi_tags=abi_tags,
-            uid=uid,
-        )
+
+        if abi_info is not None and "abi_name" in abi_info:
+            self.mangled_name = abi_info["abi_name"]
+        else:
+            self.mangled_name = mangler(
+                qualprefix,
+                self.argtypes,
+                abi_tags=abi_tags,
+                uid=uid,
+            )
+
         if env_name is None:
             env_name = mangler(
                 ".NumbaEnv.{}".format(qualprefix),
@@ -123,6 +130,7 @@ def __init__(
         self.noalias = noalias
         self.abi_tags = abi_tags
         self.call_conv = call_conv
+        self.abi_info = abi_info
 
     def lookup_globals(self):
         """
@@ -223,6 +231,7 @@ def _from_python_function(
         noalias=False,
         abi_tags=(),
         call_conv=None,
+        abi_info=None,
     ):
         (
             qualname,
@@ -252,6 +261,7 @@ def _from_python_function(
             abi_tags=abi_tags,
             uid=func_ir.func_id.unique_id,
             call_conv=call_conv,
+            abi_info=abi_info,
         )
         return self
 
@@ -275,6 +285,7 @@ def from_specialized_function(
         noalias,
         abi_tags,
         call_conv,
+        abi_info,
     ):
         """
         Build a FunctionDescriptor for a given specialization of a Python
@@ -291,6 +302,7 @@ def from_specialized_function(
             noalias=noalias,
             abi_tags=abi_tags,
             call_conv=call_conv,
+            abi_info=abi_info,
         )
 
     @classmethod
diff --git a/numba_cuda/numba/cuda/core/imputils.py b/numba_cuda/numba/cuda/core/imputils.py
index 2881c2771..30e19a264 100644
--- a/numba_cuda/numba/cuda/core/imputils.py
+++ b/numba_cuda/numba/cuda/core/imputils.py
@@ -215,13 +215,18 @@ def user_function(fndesc, libs):
     def imp(context, builder, sig, args):
         func = context.declare_function(builder.module, fndesc)
         # env=None assumes this is a nopython function
-        status, retval = context.call_conv.call_function(
+        status, retval = fndesc.call_conv.call_function(
             builder, func, fndesc.restype, fndesc.argtypes, args
         )
-        with cgutils.if_unlikely(builder, status.is_error):
-            context.call_conv.return_status_propagate(builder, status)
+
+        if status is not None:
+            with cgutils.if_unlikely(builder, status.is_error):
+                context.call_conv.return_status_propagate(builder, status)
+
         assert sig.return_type == fndesc.restype
+
         # Reconstruct optional return type
+        # XXX: What do we do now when we use CABI?
         retval = fix_returning_optional(context, builder, sig, status, retval)
         # If the data representations don't match up
         if retval.type != context.get_value_type(sig.return_type):
diff --git a/numba_cuda/numba/cuda/core/typed_passes.py b/numba_cuda/numba/cuda/core/typed_passes.py
index 28c44925b..bcab3d889 100644
--- a/numba_cuda/numba/cuda/core/typed_passes.py
+++ b/numba_cuda/numba/cuda/core/typed_passes.py
@@ -353,6 +353,7 @@ def run_pass(self, state):
                     noalias=flags.noalias,
                     abi_tags=[flags.get_mangle_string()],
                     call_conv=call_conv,
+                    abi_info=flags.abi_info,
                 )
             )
 
diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py
index 8d8b90817..f63a8f68b 100644
--- a/numba_cuda/numba/cuda/decorators.py
+++ b/numba_cuda/numba/cuda/decorators.py
@@ -265,7 +265,7 @@ def autojitwrapper(func):
                 return disp
 
 
-def declare_device(name, sig, link=None, use_cooperative=False):
+def declare_device(name, sig, link=None, use_cooperative=False, abi="numba"):
     """
     Declare the signature of a foreign function. Returns a descriptor that can
     be used to call the function from a Python kernel.
@@ -276,6 +276,9 @@ def declare_device(name, sig, link=None, use_cooperative=False):
     :param link: External code to link when calling the function.
     :param use_cooperative: External code requires cooperative launch.
     """
+    if abi not in ("numba", "c"):
+        raise NotImplementedError(f"Unsupported ABI: {abi}")
+
     if link is None:
         link = tuple()
     else:
@@ -288,7 +291,7 @@ def declare_device(name, sig, link=None, use_cooperative=False):
         raise TypeError(msg)
 
     template = declare_device_function(
-        name, restype, argtypes, link, use_cooperative
+        name, restype, argtypes, link, use_cooperative, abi
     )
 
     return template.key
diff --git a/numba_cuda/numba/cuda/flags.py b/numba_cuda/numba/cuda/flags.py
index cd265c73a..51d9eb658 100644
--- a/numba_cuda/numba/cuda/flags.py
+++ b/numba_cuda/numba/cuda/flags.py
@@ -172,6 +172,15 @@ def _call_conv_options_type(x):
         return x
 
 
+def _abi_info_options_type(x):
+    if x is None:
+        return {}
+
+    else:
+        assert isinstance(x, dict)
+        return x
+
+
 class CUDAFlags(Flags):
     nvvm_options = Option(
         type=_nvvm_options_type,
@@ -189,3 +198,5 @@ class CUDAFlags(Flags):
     lto = Option(type=bool, default=False, doc="Enable Link-time Optimization")
 
     call_conv = Option(type=_call_conv_options_type, default=None, doc="")
+
+    abi_info = Option(type=_abi_info_options_type, default=None, doc="ABI info")
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
index 31c847c20..ed4b152fa 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -178,7 +178,6 @@ def check_debug_info(self, ptx):
     # in that module anyway. So this test can only be expected to fail until we
     # have a proper way of generating device functions with the C ABI without
     # requiring the hack of generating a wrapper.
-    @unittest.expectedFailure
     def test_device_function_with_debug(self):
         # See Issue #6719 - this ensures that compilation with debug succeeds
         # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
index 3fa62728b..01a2c0b5e 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
@@ -496,5 +496,112 @@ def kernel(r, x):
         np.testing.assert_equal(r, x * 5)
 
 
+# XXX: Range from 0 - 2 input arguments
+# XXX: void return type?
+times2_cabi_cu = cuda.CUSource("""
+extern "C" __device__
+int times2(int a)
+{
+  return a * 2;
+}
+""")
+
+# 0-arg, 2-arg, and void-return C-ABI device functions.
+const42_cabi_cu = cuda.CUSource("""
+extern "C" __device__
+int const42()
+{
+  return 42;
+}
+""")
+
+add2_cabi_cu = cuda.CUSource("""
+extern "C" __device__
+int add2(int a, int b)
+{
+  return a + b;
+}
+""")
+
+consume_cabi_cu = cuda.CUSource("""
+extern "C" __device__
+void consume(int a)
+{
+  // Prevent trivial removal while still having no externally-visible effect.
+  asm volatile("" : : "r"(a) : "memory");
+}
+""")
+
+
+@skip_on_cudasim("External functions unsupported in the simulator")
+class TestDeclareDeviceCABI(CUDATestCase):
+    def test_declare_device_cabi_basic(self):
+        times2 = cuda.declare_device(
+            "times2", "int32(int32)", link=times2_cabi_cu, abi="c"
+        )
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = times2(x[i])
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+        kernel[1, 32](r, x)
+        np.testing.assert_equal(r, x * 2)
+
+    def test_declare_device_cabi_zero_args(self):
+        const42 = cuda.declare_device(
+            "const42", "int32()", link=const42_cabi_cu, abi="c"
+        )
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + const42()
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+        kernel[1, 32](r, x)
+        np.testing.assert_equal(r, x + 42)
+
+    def test_declare_device_cabi_two_args(self):
+        add2 = cuda.declare_device(
+            "add2", "int32(int32, int32)", link=add2_cabi_cu, abi="c"
+        )
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = add2(x[i], i)
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+        kernel[1, 32](r, x)
+        np.testing.assert_equal(r, x + np.arange(10, dtype=np.int32))
+
+    def test_declare_device_cabi_void_return(self):
+        consume = cuda.declare_device(
+            "consume", "void(int32)", link=consume_cabi_cu, abi="c"
+        )
+
+        @cuda.jit
+        def kernel(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                # The call itself is what we want to validate; output must remain
+                # correct, proving compilation + linking + invocation succeeded.
+                consume(x[i])
+                r[i] = x[i] * 2
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+        kernel[1, 32](r, x)
+        np.testing.assert_equal(r, x * 2)
+
+
 if __name__ == "__main__":
     unittest.main()

From e60bf0fe66b3f4c39cfcf0f0ae44c01aa05263e4 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 13 Jan 2026 10:10:04 -0800
Subject: [PATCH 04/18] remove stale skip

---
 numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py | 1 -
 numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py
index 05b758854..e219aa7c4 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py
@@ -9,7 +9,6 @@
 
 
 class TestResetDevice(CUDATestCase):
-    @unittest.skip
     def test_reset_device(self):
         def newthread(exception_queue):
             try:
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py
index 85ac77723..d432d2939 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py
@@ -37,7 +37,6 @@ def use_foo(x):
 
 @skip_under_cuda_memcheck("Hangs cuda-memcheck")
 @skip_on_cudasim("disabled for cudasim")
-@unittest.skip
 class TestMultiThreadCompiling(CUDATestCase):
     def test_concurrent_compiling(self):
         check_concurrent_compiling()

From 72f8f8e23d2cf4593ab1214fd1f51981b4176e99 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 13 Jan 2026 10:13:25 -0800
Subject: [PATCH 05/18] remove cabi_wrap_function

---
 numba_cuda/numba/cuda/compiler.py | 51 -------------------------------
 1 file changed, 51 deletions(-)

diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index e9b47a94e..64a5b9dac 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -767,57 +767,6 @@ def compile_cuda(
     return cres
 
 
-def cabi_wrap_function(
-    context, lib, fndesc, wrapper_function_name, nvvm_options
-):
-    """
-    Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.
-
-    The C ABI wrapper will have the same name as the source Python function.
-    """
-    # The wrapper will be contained in a new library that links to the wrapped
-    # function's library
-    library = lib.codegen.create_library(
-        f"{lib.name}_function_",
-        entry_name=wrapper_function_name,
-        nvvm_options=nvvm_options,
-    )
-    library.add_linking_library(lib)
-
-    # Determine the caller (C ABI) and wrapper (Numba ABI) function types
-    argtypes = fndesc.argtypes
-    restype = fndesc.restype
-    c_call_conv = CUDACABICallConv(context)
-    wrapfnty = c_call_conv.get_function_type(restype, argtypes)
-    fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)
-
-    # Create a new module and declare the callee
-    wrapper_module = context.create_module("cuda.cabi.wrapper")
-    func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)
-
-    # Define the caller - populate it with a call to the callee and return
-    # its return value
-
-    wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
-    builder = ir.IRBuilder(wrapfn.append_basic_block(""))
-
-    arginfo = context.get_arg_packer(argtypes)
-    callargs = arginfo.from_arguments(builder, wrapfn.args)
-    # We get (status, return_value), but we ignore the status since we
-    # can't propagate it through the C ABI anyway
-    _, return_value = context.call_conv.call_function(
-        builder, func, restype, argtypes, callargs
-    )
-    builder.ret(return_value)
-
-    if config.DUMP_LLVM:
-        utils.dump_llvm(fndesc, wrapper_module)
-
-    library.add_ir_module(wrapper_module)
-    library.finalize()
-    return library
-
-
 def kernel_fixup(kernel, debug):
     if debug:
         exc_helper = add_exception_store_helper(kernel)

From f0f3057696c64e2a07d01abefe044aa313799bfe Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 09:34:16 -0800
Subject: [PATCH 06/18] lift  definition to pre-branching

---
 numba_cuda/numba/cuda/compiler.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
index 81cfa6f0f..f5a038586 100644
--- a/numba_cuda/numba/cuda/compiler.py
+++ b/numba_cuda/numba/cuda/compiler.py
@@ -1081,10 +1081,8 @@ def _compile_pyfunc_with_fixup(
     if resty and not device and resty != types.void:
         raise TypeError("CUDA kernel must have void return type.")
 
-    if device:
-        lib = cres.library
-    else:
-        lib = cres.library
+    lib = cres.library
+    if not device:
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
         lib._entry_name = cres.fndesc.llvm_func_name
         kernel_fixup(kernel, debug)

From f972e09be58e0f2c46c663a5c04ed564b909c1b0 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:11:34 -0800
Subject: [PATCH 07/18] move declare_function to fndesc

---
 numba_cuda/numba/cuda/core/base.py     | 17 +----------------
 numba_cuda/numba/cuda/core/funcdesc.py | 15 ++++++++++++++-
 numba_cuda/numba/cuda/core/imputils.py |  4 ++--
 numba_cuda/numba/cuda/lowering.py      |  2 +-
 4 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/numba_cuda/numba/cuda/core/base.py b/numba_cuda/numba/cuda/core/base.py
index 4f1e7f388..8fdce6dc3 100644
--- a/numba_cuda/numba/cuda/core/base.py
+++ b/numba_cuda/numba/cuda/core/base.py
@@ -468,21 +468,6 @@ def get_external_function_type(self, fndesc):
         fnty = llvmir.FunctionType(restype, argtypes)
         return fnty
 
-    def declare_function(self, module, fndesc):
-        fnty = fndesc.call_conv.get_function_type(
-            fndesc.restype, fndesc.argtypes
-        )
-        fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name)
-        fndesc.call_conv.decorate_function(
-            fn, fndesc.args, fndesc.argtypes, noalias=fndesc.noalias
-        )
-        if fndesc.inline:
-            fn.attributes.add("alwaysinline")
-            # alwaysinline overrides optnone
-            fn.attributes.discard("noinline")
-            fn.attributes.discard("optnone")
-        return fn
-
     def declare_external_function(self, module, fndesc):
         fnty = self.get_external_function_type(fndesc)
         fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name)
@@ -988,7 +973,7 @@ def call_internal_no_propagate(self, builder, fndesc, sig, args):
         """
         # Add call to the generated function
         llvm_mod = builder.module
-        fn = self.declare_function(llvm_mod, fndesc)
+        fn = fndesc.declare_function(llvm_mod)
         status, res = self.call_conv.call_function(
             builder, fn, sig.return_type, sig.args, args
         )
diff --git a/numba_cuda/numba/cuda/core/funcdesc.py b/numba_cuda/numba/cuda/core/funcdesc.py
index db6bc7ba1..54047945d 100644
--- a/numba_cuda/numba/cuda/core/funcdesc.py
+++ b/numba_cuda/numba/cuda/core/funcdesc.py
@@ -8,7 +8,7 @@
 from collections import defaultdict
 import importlib
 
-from numba.cuda import types
+from numba.cuda import types, cgutils
 from numba.cuda import itanium_mangler
 from numba.cuda.utils import _dynamic_modname, _dynamic_module
 
@@ -265,6 +265,19 @@ def _from_python_function(
         )
         return self
 
+    def declare_function(self, module):
+        fnty = self.call_conv.get_function_type(self.restype, self.argtypes)
+        fn = cgutils.get_or_insert_function(module, fnty, self.mangled_name)
+        self.call_conv.decorate_function(
+            fn, self.args, self.argtypes, noalias=self.noalias
+        )
+        if self.inline:
+            fn.attributes.add("alwaysinline")
+            # alwaysinline overrides optnone
+            fn.attributes.discard("noinline")
+            fn.attributes.discard("optnone")
+        return fn
+
 
 class PythonFunctionDescriptor(FunctionDescriptor):
     """
diff --git a/numba_cuda/numba/cuda/core/imputils.py b/numba_cuda/numba/cuda/core/imputils.py
index 30e19a264..90a507bad 100644
--- a/numba_cuda/numba/cuda/core/imputils.py
+++ b/numba_cuda/numba/cuda/core/imputils.py
@@ -213,7 +213,7 @@ def user_function(fndesc, libs):
     """
 
     def imp(context, builder, sig, args):
-        func = context.declare_function(builder.module, fndesc)
+        func = fndesc.declare_function(builder.module)
         # env=None assumes this is a nopython function
         status, retval = fndesc.call_conv.call_function(
             builder, func, fndesc.restype, fndesc.argtypes, args
@@ -246,7 +246,7 @@ def user_generator(gendesc, libs):
     """
 
     def imp(context, builder, sig, args):
-        func = context.declare_function(builder.module, gendesc)
+        func = gendesc.declare_function(builder.module)
         # env=None assumes this is a nopython function
         status, retval = context.call_conv.call_function(
             builder, func, gendesc.restype, gendesc.argtypes, args
diff --git a/numba_cuda/numba/cuda/lowering.py b/numba_cuda/numba/cuda/lowering.py
index 8bd6b3419..6d68fe82a 100644
--- a/numba_cuda/numba/cuda/lowering.py
+++ b/numba_cuda/numba/cuda/lowering.py
@@ -344,7 +344,7 @@ def create_cfunc_wrapper(self):
 
     def setup_function(self, fndesc):
         # Setup function
-        self.function = self.context.declare_function(self.module, fndesc)
+        self.function = fndesc.declare_function(self.module)
         if self.flags.dbg_optnone:
             attrset = self.function.attributes
             if "alwaysinline" not in attrset:

From d699f1c74bf0471d4b1b061a2bee5e877eac3f1e Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:12:45 -0800
Subject: [PATCH 08/18] remove CABI optional return comment - no-op

---
 numba_cuda/numba/cuda/core/imputils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/numba_cuda/numba/cuda/core/imputils.py b/numba_cuda/numba/cuda/core/imputils.py
index 90a507bad..2517bb17a 100644
--- a/numba_cuda/numba/cuda/core/imputils.py
+++ b/numba_cuda/numba/cuda/core/imputils.py
@@ -226,7 +226,6 @@ def imp(context, builder, sig, args):
         assert sig.return_type == fndesc.restype
 
         # Reconstruct optional return type
-        # XXX: What do we do now when we use CABI?
         retval = fix_returning_optional(context, builder, sig, status, retval)
         # If the data representations don't match up
         if retval.type != context.get_value_type(sig.return_type):

From d434e371afcea3141718beb15af381c66528b257 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:13:12 -0800
Subject: [PATCH 09/18] cleanups

---
 numba_cuda/numba/cuda/core/typed_passes.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/numba_cuda/numba/cuda/core/typed_passes.py b/numba_cuda/numba/cuda/core/typed_passes.py
index 3dbacdc1b..071fa5fa0 100644
--- a/numba_cuda/numba/cuda/core/typed_passes.py
+++ b/numba_cuda/numba/cuda/core/typed_passes.py
@@ -335,8 +335,6 @@ def run_pass(self, state):
         else:
             mangler = targetctx.mangler
 
-        # mangler = targetctx.mangler
-
         msg = "Function %s failed at nopython mode lowering" % (
             state.func_id.func_name,
         )

From 45f90f691d7188662983a1f0fc949786ced1dd05 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:43:16 -0800
Subject: [PATCH 10/18] move mangler to callconv

---
 numba_cuda/numba/cuda/core/callconv.py     | 13 +++++++++++++
 numba_cuda/numba/cuda/core/typed_passes.py |  7 ++-----
 numba_cuda/numba/cuda/target.py            | 14 +++-----------
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/numba_cuda/numba/cuda/core/callconv.py b/numba_cuda/numba/cuda/core/callconv.py
index 7cffbbf73..44f5e75b6 100644
--- a/numba_cuda/numba/cuda/core/callconv.py
+++ b/numba_cuda/numba/cuda/core/callconv.py
@@ -3,6 +3,7 @@
 
 from numba.cuda import types
 from numba.cuda import cgutils
+from numba.cuda import itanium_mangler
 from collections import namedtuple
 
 from llvmlite import ir
@@ -161,6 +162,11 @@ def _get_arg_packer(self, argtypes):
         """
         return self.context.get_arg_packer(argtypes)
 
+    def mangler(self, name, argtypes, *, abi_tags=(), uid=None):
+        return itanium_mangler.mangle(
+            name, argtypes, abi_tags=abi_tags, uid=uid
+        )
+
 
 class MinimalCallConv(BaseCallConv):
     """
@@ -421,6 +427,13 @@ def call_function(self, builder, callee, resty, argtys, args):
     def get_return_type(self, ty):
         return self.context.data_model_manager[ty].get_return_type()
 
+    def mangler(self, name, argtypes, *, abi_tags=None, uid=None):
+        if name.startswith(".NumbaEnv."):
+            # return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, uid=uid)
+            func_name = name.split(".")[-1]
+            return f"_ZN08NumbaEnv{func_name}"
+        return name.split(".")[-1]
+
 
 class ErrorModel(object):
     def __init__(self, call_conv):
diff --git a/numba_cuda/numba/cuda/core/typed_passes.py b/numba_cuda/numba/cuda/core/typed_passes.py
index 071fa5fa0..d6463ca13 100644
--- a/numba_cuda/numba/cuda/core/typed_passes.py
+++ b/numba_cuda/numba/cuda/core/typed_passes.py
@@ -31,7 +31,7 @@
     compute_cfg_from_blocks,
     is_operator_or_getitem,
 )
-from numba.cuda.core.callconv import CUDACABICallConv, CUDACallConv
+from numba.cuda.core.callconv import CUDACallConv
 
 from numba.cuda.core import postproc, rewrites, funcdesc, config
 
@@ -330,10 +330,7 @@ def run_pass(self, state):
         if call_conv is None:
             call_conv = CUDACallConv(state.targetctx)
 
-        if isinstance(call_conv, CUDACABICallConv):
-            mangler = targetctx.c_abi_mangler
-        else:
-            mangler = targetctx.mangler
+        mangler = call_conv.mangler
 
         msg = "Function %s failed at nopython mode lowering" % (
             state.func_id.func_name,
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index d8ee64adf..44a7d10b2 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -269,17 +269,9 @@ def nonconst_module_attrs(self):
     def call_conv(self):
         return self.fndesc.call_conv
 
-    def mangler(self, name, argtypes, *, abi_tags=(), uid=None):
-        return itanium_mangler.mangle(
-            name, argtypes, abi_tags=abi_tags, uid=uid
-        )
-
-    def c_abi_mangler(self, name, argtypes, *, abi_tags=None, uid=None):
-        if name.startswith(".NumbaEnv."):
-            # return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, uid=uid)
-            func_name = name.split(".")[-1]
-            return f"_ZN08NumbaEnv{func_name}"
-        return name.split(".")[-1]
+    @property
+    def mangler(self):
+        return self.fndesc.call_conv.mangler
 
     def make_constant_array(self, builder, aryty, arr):
         """

From 3475e35b631f8f5e38655b9703cefb1319b22d27 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:44:04 -0800
Subject: [PATCH 11/18] document abi arg

---
 numba_cuda/numba/cuda/decorators.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py
index f63a8f68b..c414152eb 100644
--- a/numba_cuda/numba/cuda/decorators.py
+++ b/numba_cuda/numba/cuda/decorators.py
@@ -275,6 +275,7 @@ def declare_device(name, sig, link=None, use_cooperative=False, abi="numba"):
     :param sig: The Numba signature of the function.
     :param link: External code to link when calling the function.
     :param use_cooperative: External code requires cooperative launch.
+    :param abi: The ABI to use for the function. "numba" for Numba ABI, "c" for C ABI.
     """
     if abi not in ("numba", "c"):
         raise NotImplementedError(f"Unsupported ABI: {abi}")

From 1f39f7bf642b3cb2f476e739efd0d50ce2d307d6 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:45:39 -0800
Subject: [PATCH 12/18] remove stale comments

---
 numba_cuda/numba/cuda/tests/cudapy/test_device_func.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
index 01a2c0b5e..f8fdad056 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
@@ -496,8 +496,6 @@ def kernel(r, x):
         np.testing.assert_equal(r, x * 5)
 
 
-# XXX: Range from 0 - 2 input arguments
-# XXX: void return type?
 times2_cabi_cu = cuda.CUSource("""
 extern "C" __device__
 int times2(int a)

From b93f046d6cec0d0fd9ba206f5303e9774c6fe491 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 21 Jan 2026 11:05:10 -0800
Subject: [PATCH 13/18] add cpointer argument

---
 .../cuda/tests/cudapy/test_device_func.py     | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
index f8fdad056..6698e6337 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
@@ -530,6 +530,14 @@ def kernel(r, x):
 }
 """)
 
+consume_cabi_pointer_cu = cuda.CUSource("""
+extern "C" __device__
+void consume(int *a)
+{
+  *a = 42;
+}
+""")
+
 
 @skip_on_cudasim("External functions unsupported in the simulator")
 class TestDeclareDeviceCABI(CUDATestCase):
@@ -600,6 +608,27 @@ def kernel(r, x):
         kernel[1, 32](r, x)
         np.testing.assert_equal(r, x * 2)
 
+    def test_declare_device_cabi_pointer_return(self):
+        ffi = cffi.FFI()
+        consume = cuda.declare_device(
+            "consume",
+            types.void(types.CPointer(types.int32)),
+            link=consume_cabi_pointer_cu,
+            abi="c",
+        )
+
+        @cuda.jit
+        def kernel(x):
+            i = cuda.grid(1)
+            if i < len(x):
+                ptr = ffi.from_buffer(x[i:])
+                consume(ptr)
+
+        x = np.zeros(10, dtype=np.int32)
+
+        kernel[1, 32](x)
+        np.testing.assert_equal(x, 42)
+
 
 if __name__ == "__main__":
     unittest.main()

From 684ad4f82a46078626f2a8cd94c155f4b4ad64f5 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 22 Jan 2026 10:55:26 -0800
Subject: [PATCH 14/18] remove context.call_conv property

---
 numba_cuda/numba/cuda/cgutils.py              |  8 ++---
 numba_cuda/numba/cuda/core/base.py            |  4 +--
 numba_cuda/numba/cuda/core/compiler.py        |  1 -
 numba_cuda/numba/cuda/core/imputils.py        |  8 ++---
 numba_cuda/numba/cuda/core/optional.py        |  2 +-
 numba_cuda/numba/cuda/core/pythonapi.py       |  4 +--
 numba_cuda/numba/cuda/cpython/builtins.py     |  2 +-
 numba_cuda/numba/cuda/cpython/charseq.py      |  6 ++--
 numba_cuda/numba/cuda/cpython/iterators.py    |  2 +-
 numba_cuda/numba/cuda/cpython/listobj.py      | 14 +++++---
 numba_cuda/numba/cuda/cpython/rangeobj.py     |  2 +-
 numba_cuda/numba/cuda/cpython/slicing.py      |  4 +--
 numba_cuda/numba/cuda/cpython/tupleobj.py     | 16 ++++++---
 numba_cuda/numba/cuda/lowering.py             |  2 +-
 .../cuda/memory_management/nrt_context.py     |  6 ++--
 numba_cuda/numba/cuda/misc/gdb_hook.py        |  4 +--
 numba_cuda/numba/cuda/np/arrayobj.py          | 34 ++++++++++++-------
 numba_cuda/numba/cuda/np/npyimpl.py           | 10 +++---
 .../cuda/np/polynomial/polynomial_core.py     |  4 +--
 numba_cuda/numba/cuda/target.py               |  4 ---
 20 files changed, 77 insertions(+), 60 deletions(-)

diff --git a/numba_cuda/numba/cuda/cgutils.py b/numba_cuda/numba/cuda/cgutils.py
index 9fbadfff8..aa79662a7 100644
--- a/numba_cuda/numba/cuda/cgutils.py
+++ b/numba_cuda/numba/cuda/cgutils.py
@@ -763,12 +763,12 @@ def _dbg():
     with if_unlikely(builder, out_of_bounds_upper):
         if config.FULL_TRACEBACKS:
             _dbg()
-        context.call_conv.return_user_exc(builder, IndexError, (msg,))
+        context.fndesc.call_conv.return_user_exc(builder, IndexError, (msg,))
     out_of_bounds_lower = builder.icmp_signed("<", ind, ind.type(0))
     with if_unlikely(builder, out_of_bounds_lower):
         if config.FULL_TRACEBACKS:
             _dbg()
-        context.call_conv.return_user_exc(builder, IndexError, (msg,))
+        context.fndesc.call_conv.return_user_exc(builder, IndexError, (msg,))
 
 
 def get_item_pointer2(
@@ -935,7 +935,7 @@ def guard_null(context, builder, value, exc_tuple):
     with builder.if_then(is_scalar_zero(builder, value), likely=False):
         exc = exc_tuple[0]
         exc_args = exc_tuple[1:] or None
-        context.call_conv.return_user_exc(builder, exc, exc_args)
+        context.fndesc.call_conv.return_user_exc(builder, exc, exc_args)
 
 
 def guard_memory_error(context, builder, pointer, msg=None):
@@ -945,7 +945,7 @@ def guard_memory_error(context, builder, pointer, msg=None):
     assert isinstance(pointer.type, ir.PointerType), pointer.type
     exc_args = (msg,) if msg else ()
     with builder.if_then(is_null(builder, pointer), likely=False):
-        context.call_conv.return_user_exc(builder, MemoryError, exc_args)
+        context.fndesc.call_conv.return_user_exc(builder, MemoryError, exc_args)
 
 
 @contextmanager
diff --git a/numba_cuda/numba/cuda/core/base.py b/numba_cuda/numba/cuda/core/base.py
index 8fdce6dc3..0afb90bae 100644
--- a/numba_cuda/numba/cuda/core/base.py
+++ b/numba_cuda/numba/cuda/core/base.py
@@ -962,7 +962,7 @@ def call_internal(self, builder, fndesc, sig, args):
             builder, fndesc, sig, args
         )
         with cgutils.if_unlikely(builder, status.is_error):
-            self.call_conv.return_status_propagate(builder, status)
+            fndesc.call_conv.return_status_propagate(builder, status)
 
         res = imputils.fix_returning_optional(self, builder, sig, status, res)
         return res
@@ -974,7 +974,7 @@ def call_internal_no_propagate(self, builder, fndesc, sig, args):
         # Add call to the generated function
         llvm_mod = builder.module
         fn = fndesc.declare_function(llvm_mod)
-        status, res = self.call_conv.call_function(
+        status, res = fndesc.call_conv.call_function(
             builder, fn, sig.return_type, sig.args, args
         )
         return status, res
diff --git a/numba_cuda/numba/cuda/core/compiler.py b/numba_cuda/numba/cuda/core/compiler.py
index 42e5f136c..0db2d4071 100644
--- a/numba_cuda/numba/cuda/core/compiler.py
+++ b/numba_cuda/numba/cuda/core/compiler.py
@@ -69,7 +69,6 @@ def _make_subtarget(targetctx, flags):
     if flags.fastmath:
         subtargetoptions["fastmath"] = flags.fastmath
 
-    # FIXME: should update everywhere uses error_model to use callconv from fundesc
     call_conv = CUDACallConv(targetctx)
     error_model = callconv.create_error_model(flags.error_model, call_conv)
     subtargetoptions["error_model"] = error_model
diff --git a/numba_cuda/numba/cuda/core/imputils.py b/numba_cuda/numba/cuda/core/imputils.py
index 2517bb17a..31c12a733 100644
--- a/numba_cuda/numba/cuda/core/imputils.py
+++ b/numba_cuda/numba/cuda/core/imputils.py
@@ -212,7 +212,7 @@ def user_function(fndesc, libs):
     A wrapper inserting code calling Numba-compiled *fndesc*.
     """
 
-    def imp(context, builder, sig, args):
+    def imp(context, builder, sig, args, fndesc=fndesc):
         func = fndesc.declare_function(builder.module)
         # env=None assumes this is a nopython function
         status, retval = fndesc.call_conv.call_function(
@@ -221,7 +221,7 @@ def imp(context, builder, sig, args):
 
         if status is not None:
             with cgutils.if_unlikely(builder, status.is_error):
-                context.call_conv.return_status_propagate(builder, status)
+                fndesc.call_conv.return_status_propagate(builder, status)
 
         assert sig.return_type == fndesc.restype
 
@@ -244,10 +244,10 @@ def user_generator(gendesc, libs):
     A wrapper inserting code calling Numba-compiled *gendesc*.
     """
 
-    def imp(context, builder, sig, args):
+    def imp(context, builder, sig, args, gendesc=gendesc):
         func = gendesc.declare_function(builder.module)
         # env=None assumes this is a nopython function
-        status, retval = context.call_conv.call_function(
+        status, retval = gendesc.call_conv.call_function(
             builder, func, gendesc.restype, gendesc.argtypes, args
         )
         # Return raw status for caller to process StopIteration
diff --git a/numba_cuda/numba/cuda/core/optional.py b/numba_cuda/numba/cuda/core/optional.py
index 8cad47935..e7880e7e0 100644
--- a/numba_cuda/numba/cuda/core/optional.py
+++ b/numba_cuda/numba/cuda/core/optional.py
@@ -124,6 +124,6 @@ def optional_to_any(context, builder, fromty, toty, val):
     validbit = cgutils.as_bool_bit(builder, optval.valid)
     with builder.if_then(builder.not_(validbit), likely=False):
         msg = "expected %s, got None" % (fromty.type,)
-        context.call_conv.return_user_exc(builder, TypeError, (msg,))
+        context.fndesc.call_conv.return_user_exc(builder, TypeError, (msg,))
 
     return context.cast(builder, optval.data, fromty.type, toty)
diff --git a/numba_cuda/numba/cuda/core/pythonapi.py b/numba_cuda/numba/cuda/core/pythonapi.py
index 1e4fa5ed1..a97f45b7a 100644
--- a/numba_cuda/numba/cuda/core/pythonapi.py
+++ b/numba_cuda/numba/cuda/core/pythonapi.py
@@ -232,7 +232,7 @@ def emit_environment_sentry(
                 )
                 self.builder.ret(self.get_null_object())
             else:
-                self.context.call_conv.return_user_exc(
+                self.context.fndesc.call_conv.return_user_exc(
                     self.builder,
                     RuntimeError,
                     (f"missing Environment: {debug_msg}",),
@@ -1773,7 +1773,7 @@ def call_jit_code(self, func, sig, args):
             with has_err:
                 builder.store(status.is_error, is_error_ptr)
                 # Set error state in the Python interpreter
-                self.context.call_conv.raise_error(builder, self, status)
+                self.context.fndesc.call_conv.raise_error(builder, self, status)
             with no_err:
                 # Handle returned value
                 res = imputils.fix_returning_optional(
diff --git a/numba_cuda/numba/cuda/cpython/builtins.py b/numba_cuda/numba/cuda/cpython/builtins.py
index 8a5eeac09..71d833956 100644
--- a/numba_cuda/numba/cuda/cpython/builtins.py
+++ b/numba_cuda/numba/cuda/cpython/builtins.py
@@ -446,7 +446,7 @@ def next_impl(context, builder, sig, args):
     res = call_iternext(context, builder, iterty, iterval)
 
     with builder.if_then(builder.not_(res.is_valid()), likely=False):
-        context.call_conv.return_user_exc(builder, StopIteration, ())
+        context.fndesc.call_conv.return_user_exc(builder, StopIteration, ())
 
     return res.yielded_value()
 
diff --git a/numba_cuda/numba/cuda/cpython/charseq.py b/numba_cuda/numba/cuda/cpython/charseq.py
index d6738a6f1..b964aecc9 100644
--- a/numba_cuda/numba/cuda/cpython/charseq.py
+++ b/numba_cuda/numba/cuda/cpython/charseq.py
@@ -244,7 +244,7 @@ def unicode_to_bytes_cast(context, builder, fromty, toty, val):
     src_length = uni_str.length
 
     with builder.if_then(notkind1):
-        context.call_conv.return_user_exc(
+        context.fndesc.call_conv.return_user_exc(
             builder,
             ValueError,
             ("cannot cast higher than 8-bit unicode_type to bytes",),
@@ -315,7 +315,7 @@ def unicode_to_unicode_charseq(context, builder, fromty, toty, val):
                 in_val = builder.zext(builder.load(in_ptr), dstint_t)
                 builder.store(in_val, builder.gep(dst, [loop.index]))
         else:
-            context.call_conv.return_user_exc(
+            context.fndesc.call_conv.return_user_exc(
                 builder,
                 ValueError,
                 (
@@ -331,7 +331,7 @@ def unicode_to_unicode_charseq(context, builder, fromty, toty, val):
                 in_val = builder.zext(builder.load(in_ptr), dstint_t)
                 builder.store(in_val, builder.gep(dst, [loop.index]))
         else:
-            context.call_conv.return_user_exc(
+            context.fndesc.call_conv.return_user_exc(
                 builder,
                 ValueError,
                 (
diff --git a/numba_cuda/numba/cuda/cpython/iterators.py b/numba_cuda/numba/cuda/cpython/iterators.py
index df7a87b75..e3606070f 100644
--- a/numba_cuda/numba/cuda/cpython/iterators.py
+++ b/numba_cuda/numba/cuda/cpython/iterators.py
@@ -164,4 +164,4 @@ def iternext_zip(context, builder, sig, args, result):  # noqa: F811
         builder,
         builder.and_(status.is_error, builder.not_(status.is_stop_iteration)),
     ):
-        context.call_conv.return_status_propagate(builder, status)
+        context.fndesc.call_conv.return_status_propagate(builder, status)
diff --git a/numba_cuda/numba/cuda/cpython/listobj.py b/numba_cuda/numba/cuda/cpython/listobj.py
index ee059a8e7..b5055a840 100644
--- a/numba_cuda/numba/cuda/cpython/listobj.py
+++ b/numba_cuda/numba/cuda/cpython/listobj.py
@@ -117,7 +117,7 @@ def guard_index(self, idx, msg):
         Raise an error if the index is out of bounds.
         """
         with self._builder.if_then(self.is_out_of_bounds(idx), likely=False):
-            self._context.call_conv.return_user_exc(
+            self._context.fndesc.call_conv.return_user_exc(
                 self._builder, IndexError, (msg,)
             )
 
@@ -347,7 +347,7 @@ def allocate(cls, context, builder, list_type, nitems):
         """
         ok, self = cls.allocate_ex(context, builder, list_type, nitems)
         with builder.if_then(builder.not_(ok), likely=False):
-            context.call_conv.return_user_exc(
+            context.fndesc.call_conv.return_user_exc(
                 builder, MemoryError, ("cannot allocate list",)
             )
         return self
@@ -384,7 +384,7 @@ def _payload_realloc(new_allocated):
                 ir.Constant(intp_t, payload_size),
             )
             with builder.if_then(ovf, likely=False):
-                context.call_conv.return_user_exc(
+                context.fndesc.call_conv.return_user_exc(
                     builder, MemoryError, ("cannot resize list",)
                 )
 
@@ -654,7 +654,9 @@ def setitem_list(context, builder, sig, args):  # noqa: F811
         with otherwise:
             with builder.if_then(builder.icmp_signed("!=", size_delta, zero)):
                 msg = "cannot resize extended list slice with step != 1"
-                context.call_conv.return_user_exc(builder, ValueError, (msg,))
+                context.fndesc.call_conv.return_user_exc(
+                    builder, ValueError, (msg,)
+                )
 
             with cgutils.for_range_slice_generic(
                 builder, slice.start, slice.stop, slice.step
@@ -693,7 +695,9 @@ def delitem_list(context, builder, sig, args):
         builder.icmp_signed("!=", slice.step, one), likely=False
     ):
         msg = "unsupported del list[start:stop:step] with step != 1"
-        context.call_conv.return_user_exc(builder, NotImplementedError, (msg,))
+        context.fndesc.call_conv.return_user_exc(
+            builder, NotImplementedError, (msg,)
+        )
 
     # Compute the real stop, e.g. for dest[2:0]
     start = slice.start
diff --git a/numba_cuda/numba/cuda/cpython/rangeobj.py b/numba_cuda/numba/cuda/cpython/rangeobj.py
index 00a75cba8..99b7903d1 100644
--- a/numba_cuda/numba/cuda/cpython/rangeobj.py
+++ b/numba_cuda/numba/cuda/cpython/rangeobj.py
@@ -158,7 +158,7 @@ def from_range_state(cls, context, builder, state):
 
             with cgutils.if_unlikely(builder, zero_step):
                 # step shouldn't be zero
-                context.call_conv.return_user_exc(
+                context.fndesc.call_conv.return_user_exc(
                     builder, ValueError, ("range() arg 3 must not be zero",)
                 )
 
diff --git a/numba_cuda/numba/cuda/cpython/slicing.py b/numba_cuda/numba/cuda/cpython/slicing.py
index 0899db994..ed9ae163f 100644
--- a/numba_cuda/numba/cuda/cpython/slicing.py
+++ b/numba_cuda/numba/cuda/cpython/slicing.py
@@ -240,13 +240,13 @@ def slice_indices(context, builder, sig, args):
     sli = context.make_helper(builder, sig.args[0], args[0])
 
     with builder.if_then(cgutils.is_neg_int(builder, length), likely=False):
-        context.call_conv.return_user_exc(
+        context.fndesc.call_conv.return_user_exc(
             builder, ValueError, ("length should not be negative",)
         )
     with builder.if_then(
         cgutils.is_scalar_zero(builder, sli.step), likely=False
     ):
-        context.call_conv.return_user_exc(
+        context.fndesc.call_conv.return_user_exc(
             builder, ValueError, ("slice step cannot be zero",)
         )
 
diff --git a/numba_cuda/numba/cuda/cpython/tupleobj.py b/numba_cuda/numba/cuda/cpython/tupleobj.py
index 5292d412d..0483a3960 100644
--- a/numba_cuda/numba/cuda/cpython/tupleobj.py
+++ b/numba_cuda/numba/cuda/cpython/tupleobj.py
@@ -251,7 +251,9 @@ def getitem_typed(context, builder, sig, args):
 
         # Always branch and raise IndexError
         with builder.if_then(cgutils.true_bit):
-            context.call_conv.return_user_exc(builder, IndexError, errmsg_oob)
+            context.fndesc.call_conv.return_user_exc(
+                builder, IndexError, errmsg_oob
+            )
         # This is unreachable in runtime,
         # but it exists to not terminate the current basicblock.
         res = context.get_constant_null(sig.return_type)
@@ -264,7 +266,9 @@ def getitem_typed(context, builder, sig, args):
         switch = builder.switch(idx, bbelse)
 
         with builder.goto_block(bbelse):
-            context.call_conv.return_user_exc(builder, IndexError, errmsg_oob)
+            context.fndesc.call_conv.return_user_exc(
+                builder, IndexError, errmsg_oob
+            )
 
         lrtty = context.get_value_type(sig.return_type)
         voidptrty = context.get_value_type(types.voidptr)
@@ -345,7 +349,9 @@ def getitem_unituple(context, builder, sig, args):
 
         # Always branch and raise IndexError
         with builder.if_then(cgutils.true_bit):
-            context.call_conv.return_user_exc(builder, IndexError, errmsg_oob)
+            context.fndesc.call_conv.return_user_exc(
+                builder, IndexError, errmsg_oob
+            )
         # This is unreachable in runtime,
         # but it exists to not terminate the current basicblock.
         res = context.get_constant_null(sig.return_type)
@@ -357,7 +363,9 @@ def getitem_unituple(context, builder, sig, args):
         switch = builder.switch(idx, bbelse)
 
         with builder.goto_block(bbelse):
-            context.call_conv.return_user_exc(builder, IndexError, errmsg_oob)
+            context.fndesc.call_conv.return_user_exc(
+                builder, IndexError, errmsg_oob
+            )
 
         lrtty = context.get_value_type(tupty.dtype)
         with builder.goto_block(bbend):
diff --git a/numba_cuda/numba/cuda/lowering.py b/numba_cuda/numba/cuda/lowering.py
index 6d68fe82a..7b6afbf90 100644
--- a/numba_cuda/numba/cuda/lowering.py
+++ b/numba_cuda/numba/cuda/lowering.py
@@ -99,7 +99,7 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None):
 
     @property
     def call_conv(self):
-        return self.context.call_conv
+        return self.fndesc.call_conv
 
     def init(self):
         pass
diff --git a/numba_cuda/numba/cuda/memory_management/nrt_context.py b/numba_cuda/numba/cuda/memory_management/nrt_context.py
index 208412d3b..99c74c42e 100644
--- a/numba_cuda/numba/cuda/memory_management/nrt_context.py
+++ b/numba_cuda/numba/cuda/memory_management/nrt_context.py
@@ -415,7 +415,7 @@ def get_nrt_api(self, builder):
     def eh_check(self, builder):
         """Check if an exception is raised"""
         ctx = self._context
-        cc = ctx.call_conv
+        cc = ctx.fndesc.call_conv
         # Inspect the excinfo argument on the function
         trystatus = cc.check_try_status(builder)
         excinfo = trystatus.excinfo
@@ -428,11 +428,11 @@ def eh_check(self, builder):
     def eh_try(self, builder):
         """Begin a try-block."""
         ctx = self._context
-        cc = ctx.call_conv
+        cc = ctx.fndesc.call_conv
         cc.set_try_status(builder)
 
     def eh_end_try(self, builder):
         """End a try-block"""
         ctx = self._context
-        cc = ctx.call_conv
+        cc = ctx.fndesc.call_conv
         cc.unset_try_status(builder)
diff --git a/numba_cuda/numba/cuda/misc/gdb_hook.py b/numba_cuda/numba/cuda/misc/gdb_hook.py
index 2c8bd3c8d..805f952e7 100644
--- a/numba_cuda/numba/cuda/misc/gdb_hook.py
+++ b/numba_cuda/numba/cuda/misc/gdb_hook.py
@@ -162,14 +162,14 @@ def init_gdb_codegen(
     invalid_write = builder.icmp_signed(">", stat, int32_t(12))
     with builder.if_then(invalid_write, likely=False):
         msg = "Internal error: `snprintf` buffer would have overflowed."
-        cgctx.call_conv.return_user_exc(builder, RuntimeError, (msg,))
+        cgctx.fndesc.call_conv.return_user_exc(builder, RuntimeError, (msg,))
 
     # fork, check pids etc
     child_pid = builder.call(fork, tuple())
     fork_failed = builder.icmp_signed("==", child_pid, int32_t(-1))
     with builder.if_then(fork_failed, likely=False):
         msg = "Internal error: `fork` failed."
-        cgctx.call_conv.return_user_exc(builder, RuntimeError, (msg,))
+        cgctx.fndesc.call_conv.return_user_exc(builder, RuntimeError, (msg,))
 
     is_child = builder.icmp_signed("==", child_pid, zero_i32t)
     with builder.if_else(is_child) as (then, orelse):
diff --git a/numba_cuda/numba/cuda/np/arrayobj.py b/numba_cuda/numba/cuda/np/arrayobj.py
index 66c69920c..ca2af1b77 100644
--- a/numba_cuda/numba/cuda/np/arrayobj.py
+++ b/numba_cuda/numba/cuda/np/arrayobj.py
@@ -672,7 +672,7 @@ def array_item(context, builder, sig, args):
         builder.icmp_signed("!=", nitems, nitems.type(1)), likely=False
     ):
         msg = "item(): can only convert an array of size 1 to a Python scalar"
-        context.call_conv.return_user_exc(builder, ValueError, (msg,))
+        context.fndesc.call_conv.return_user_exc(builder, ValueError, (msg,))
 
     return load_item(context, builder, aryty, ary.data)
 
@@ -691,7 +691,9 @@ def array_itemset(context, builder, sig, args):
             builder.icmp_signed("!=", nitems, nitems.type(1)), likely=False
         ):
             msg = "itemset(): can only write to an array of size 1"
-            context.call_conv.return_user_exc(builder, ValueError, (msg,))
+            context.fndesc.call_conv.return_user_exc(
+                builder, ValueError, (msg,)
+            )
 
         store_item(context, builder, aryty, val, ary.data)
         return context.get_dummy_value()
@@ -1478,7 +1480,9 @@ def _bc_adjust_dimension(context, builder, shapes, strides, target_shape):
         # Check error
         with builder.if_then(builder.not_(accepted), likely=False):
             msg = "cannot broadcast source array for assignment"
-            context.call_conv.return_user_exc(builder, ValueError, (msg,))
+            context.fndesc.call_conv.return_user_exc(
+                builder, ValueError, (msg,)
+            )
         # Truncate extra shapes, strides
         shapes = shapes[nd_diff:]
         strides = strides[nd_diff:]
@@ -2426,7 +2430,9 @@ def array_reshape(context, builder, sig, args):
 
     with builder.if_then(fail):
         msg = "incompatible shape for array"
-        context.call_conv.return_user_exc(builder, NotImplementedError, (msg,))
+        context.fndesc.call_conv.return_user_exc(
+            builder, NotImplementedError, (msg,)
+        )
 
     ret = make_array(retty)(context, builder)
     populate_array(
@@ -3099,7 +3105,7 @@ def array_view(context, builder, sig, args):
 
     with builder.if_then(fail):
         msg = "new type not compatible with array"
-        context.call_conv.return_user_exc(builder, ValueError, (msg,))
+        context.fndesc.call_conv.return_user_exc(builder, ValueError, (msg,))
 
     res = ret._getvalue()
     return impl_ret_borrowed(context, builder, sig.return_type, res)
@@ -4719,7 +4725,7 @@ def _empty_nd_impl(context, builder, arrtype, shapes):
     with builder.if_then(overflow, likely=False):
         # Raise same error as numpy, see:
         # https://github.com/numpy/numpy/blob/2a488fe76a0f732dc418d03b452caace161673da/numpy/core/src/multiarray/ctors.c#L1095-L1101    # noqa: E501
-        context.call_conv.return_user_exc(
+        context.fndesc.call_conv.return_user_exc(
             builder,
             ValueError,
             (
@@ -4803,7 +4809,7 @@ def safecast_intp(context, builder, src_t, src):
         elif src_t.width >= intp_width:
             is_larger = builder.icmp_signed(">", src, maxval)
             with builder.if_then(is_larger, likely=False):
-                context.call_conv.return_user_exc(
+                context.fndesc.call_conv.return_user_exc(
                     builder,
                     ValueError,
                     ("Cannot safely convert value to intp",),
@@ -4828,7 +4834,7 @@ def safecast_intp(context, builder, src_t, src):
     for shape in shapes:
         is_neg = builder.icmp_signed("<", shape, zero)
         with cgutils.if_unlikely(builder, is_neg):
-            context.call_conv.return_user_exc(
+            context.fndesc.call_conv.return_user_exc(
                 builder, ValueError, ("negative dimensions not allowed",)
             )
 
@@ -5758,7 +5764,9 @@ def codegen(context, builder, sig, args):
         is_incompatible = cgutils.is_not_null(builder, rem)
         with builder.if_then(is_incompatible, likely=False):
             msg = "buffer size must be a multiple of element size"
-            context.call_conv.return_user_exc(builder, ValueError, (msg,))
+            context.fndesc.call_conv.return_user_exc(
+                builder, ValueError, (msg,)
+            )
 
         shape = cgutils.pack_array(builder, [builder.sdiv(nbytes, ll_itemsize)])
         strides = cgutils.pack_array(builder, [ll_itemsize])
@@ -6013,7 +6021,7 @@ def check_sequence_shape(context, builder, seqty, seq, shapes):
     """
 
     def _fail():
-        context.call_conv.return_user_exc(
+        context.fndesc.call_conv.return_user_exc(
             builder, ValueError, ("incompatible sequence shape",)
         )
 
@@ -6168,7 +6176,7 @@ def _normalize_axis(context, builder, func_name, ndim, axis):
     )
     with builder.if_then(axis_out_of_bounds, likely=False):
         msg = "%s(): axis out of bounds" % func_name
-        context.call_conv.return_user_exc(builder, IndexError, (msg,))
+        context.fndesc.call_conv.return_user_exc(builder, IndexError, (msg,))
 
     return axis
 
@@ -6488,7 +6496,7 @@ def _np_concatenate(context, builder, arrtys, arrs, retty, axis):
                         is_ok, builder.icmp_signed("==", sh, ret_sh)
                     )
                 with builder.if_then(builder.not_(is_ok), likely=False):
-                    context.call_conv.return_user_exc(
+                    context.fndesc.call_conv.return_user_exc(
                         builder,
                         ValueError,
                         (
@@ -6537,7 +6545,7 @@ def _np_stack(context, builder, arrtys, arrs, retty, axis):
         ):
             is_ok = builder.and_(is_ok, builder.icmp_signed("==", sh, orig_sh))
             with builder.if_then(builder.not_(is_ok), likely=False):
-                context.call_conv.return_user_exc(
+                context.fndesc.call_conv.return_user_exc(
                     builder,
                     ValueError,
                     ("np.stack(): all input arrays must have the same shape",),
diff --git a/numba_cuda/numba/cuda/np/npyimpl.py b/numba_cuda/numba/cuda/np/npyimpl.py
index 472f55645..be7c0d007 100644
--- a/numba_cuda/numba/cuda/np/npyimpl.py
+++ b/numba_cuda/numba/cuda/np/npyimpl.py
@@ -507,7 +507,9 @@ def make_intp_const(val):
             if loc is not None:
                 msg += '\nFile "%s", line %d, ' % (loc.filename, loc.line)
 
-            context.call_conv.return_user_exc(builder, ValueError, (msg,))
+            context.fndesc.call_conv.return_user_exc(
+                builder, ValueError, (msg,)
+            )
 
     real_array_ty = array_ty.as_array
 
@@ -799,11 +801,11 @@ def generate(self, *args):
             for val, inty, outty in zip(args, osig.args, isig.args)
         ]
         if self.cres.objectmode:
-            func_type = self.context.call_conv.get_function_type(
+            func_type = self.context.fndesc.call_conv.get_function_type(
                 types.pyobject, [types.pyobject] * len(isig.args)
             )
         else:
-            func_type = self.context.call_conv.get_function_type(
+            func_type = self.context.fndesc.call_conv.get_function_type(
                 isig.return_type, isig.args
             )
         module = self.builder.block.function.module
@@ -812,7 +814,7 @@ def generate(self, *args):
         )
         entry_point.attributes.add("alwaysinline")
 
-        _, res = self.context.call_conv.call_function(
+        _, res = self.context.fndesc.call_conv.call_function(
             self.builder, entry_point, isig.return_type, isig.args, cast_args
         )
         return self.cast(res, isig.return_type, osig.return_type)
diff --git a/numba_cuda/numba/cuda/np/polynomial/polynomial_core.py b/numba_cuda/numba/cuda/np/polynomial/polynomial_core.py
index a8a28f8f2..e10fef477 100644
--- a/numba_cuda/numba/cuda/np/polynomial/polynomial_core.py
+++ b/numba_cuda/numba/cuda/np/polynomial/polynomial_core.py
@@ -149,12 +149,12 @@ def to_double(coef):
     pred2 = builder.icmp_signed("!=", s2, two)
 
     with cgutils.if_unlikely(builder, pred1):
-        context.call_conv.return_user_exc(
+        context.fndesc.call_conv.return_user_exc(
             builder, ValueError, ("Domain has wrong number of elements.",)
         )
 
     with cgutils.if_unlikely(builder, pred2):
-        context.call_conv.return_user_exc(
+        context.fndesc.call_conv.return_user_exc(
             builder, ValueError, ("Window has wrong number of elements.",)
         )
 
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index 44a7d10b2..2622c1397 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -265,10 +265,6 @@ def nonconst_module_attrs(self):
         )
         return nonconsts_with_mod
 
-    @property
-    def call_conv(self):
-        return self.fndesc.call_conv
-
     @property
     def mangler(self):
         return self.fndesc.call_conv.mangler

From 27199d2d24afc2ff55f68446867cd4152d40cc53 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 22 Jan 2026 13:16:34 -0800
Subject: [PATCH 15/18] use callconv mangler everywhere

---
 numba_cuda/numba/cuda/core/generators.py | 2 +-
 numba_cuda/numba/cuda/cpython/numbers.py | 3 ++-
 numba_cuda/numba/cuda/lowering.py        | 2 +-
 numba_cuda/numba/cuda/np/math/numbers.py | 3 ++-
 numba_cuda/numba/cuda/target.py          | 4 ----
 5 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/numba_cuda/numba/cuda/core/generators.py b/numba_cuda/numba/cuda/core/generators.py
index f296d6b04..5a85b4f91 100644
--- a/numba_cuda/numba/cuda/core/generators.py
+++ b/numba_cuda/numba/cuda/core/generators.py
@@ -76,7 +76,7 @@ def __init__(self, lower):
         self.geninfo = lower.generator_info
         self.gentype = self.get_generator_type()
         self.gendesc = GeneratorDescriptor.from_generator_fndesc(
-            lower.func_ir, self.fndesc, self.gentype, self.context.mangler
+            lower.func_ir, self.fndesc, self.gentype, self.call_conv.mangler
         )
         # Helps packing non-omitted arguments into a structure
         self.arg_packer = self.context.get_data_packer(self.fndesc.argtypes)
diff --git a/numba_cuda/numba/cuda/cpython/numbers.py b/numba_cuda/numba/cuda/cpython/numbers.py
index d3a24b934..4ffe3ad86 100644
--- a/numba_cuda/numba/cuda/cpython/numbers.py
+++ b/numba_cuda/numba/cuda/cpython/numbers.py
@@ -671,7 +671,8 @@ def real_divmod(context, builder, x, y):
     floatty = x.type
 
     module = builder.module
-    fname = context.mangler(".numba.python.rem", [x.type])
+    call_conv = context.fndesc.call_conv
+    fname = call_conv.mangler(".numba.python.rem", [x.type])
     fnty = ir.FunctionType(floatty, (floatty, floatty, ir.PointerType(floatty)))
     fn = cgutils.get_or_insert_function(module, fnty, fname)
 
diff --git a/numba_cuda/numba/cuda/lowering.py b/numba_cuda/numba/cuda/lowering.py
index 7b6afbf90..effa6d32c 100644
--- a/numba_cuda/numba/cuda/lowering.py
+++ b/numba_cuda/numba/cuda/lowering.py
@@ -1197,7 +1197,7 @@ def _lower_call_RecursiveCall(self, fnty, expr, signature):
             expr.kws,
         )
         rec_ov = fnty.get_overloads(signature.args)
-        mangler = self.context.mangler or default_mangler
+        mangler = self.call_conv.mangler or default_mangler
         abi_tags = self.fndesc.abi_tags
         mangled_name = mangler(
             rec_ov.qualname, signature.args, abi_tags=abi_tags, uid=rec_ov.uid
diff --git a/numba_cuda/numba/cuda/np/math/numbers.py b/numba_cuda/numba/cuda/np/math/numbers.py
index cf0875b9c..b4b20cd49 100644
--- a/numba_cuda/numba/cuda/np/math/numbers.py
+++ b/numba_cuda/numba/cuda/np/math/numbers.py
@@ -657,7 +657,8 @@ def real_divmod(context, builder, x, y):
     floatty = x.type
 
     module = builder.module
-    fname = context.mangler(".numba.python.rem", [x.type])
+    call_conv = context.fndesc.call_conv
+    fname = call_conv.mangler(".numba.python.rem", [x.type])
     fnty = ir.FunctionType(floatty, (floatty, floatty, ir.PointerType(floatty)))
     fn = cgutils.get_or_insert_function(module, fnty, fname)
 
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index 2622c1397..4141ecf7f 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -265,10 +265,6 @@ def nonconst_module_attrs(self):
         )
         return nonconsts_with_mod
 
-    @property
-    def mangler(self):
-        return self.fndesc.call_conv.mangler
-
     def make_constant_array(self, builder, aryty, arr):
         """
         Unlike the parent version.  This returns a a pointer in the constant

From 864a40cb31d49d804d2dc54f3f9c9a0b0046706c Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 22 Jan 2026 13:18:57 -0800
Subject: [PATCH 16/18] cleanups

---
 numba_cuda/numba/cuda/core/callconv.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/numba_cuda/numba/cuda/core/callconv.py b/numba_cuda/numba/cuda/core/callconv.py
index 44f5e75b6..a9489a892 100644
--- a/numba_cuda/numba/cuda/core/callconv.py
+++ b/numba_cuda/numba/cuda/core/callconv.py
@@ -429,7 +429,6 @@ def get_return_type(self, ty):
 
     def mangler(self, name, argtypes, *, abi_tags=None, uid=None):
         if name.startswith(".NumbaEnv."):
-            # return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, uid=uid)
             func_name = name.split(".")[-1]
             return f"_ZN08NumbaEnv{func_name}"
         return name.split(".")[-1]

From 15dcbe506ad1d2b0258534d676aa0beef98fdea5 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 26 Jan 2026 09:53:27 -0800
Subject: [PATCH 17/18] address review comments

---
 numba_cuda/numba/cuda/core/compiler.py   | 2 ++
 numba_cuda/numba/cuda/core/generators.py | 5 ++++-
 numba_cuda/numba/cuda/target.py          | 9 +++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/numba_cuda/numba/cuda/core/compiler.py b/numba_cuda/numba/cuda/core/compiler.py
index 0db2d4071..af8c26ee3 100644
--- a/numba_cuda/numba/cuda/core/compiler.py
+++ b/numba_cuda/numba/cuda/core/compiler.py
@@ -69,6 +69,8 @@ def _make_subtarget(targetctx, flags):
     if flags.fastmath:
         subtargetoptions["fastmath"] = flags.fastmath
 
+    # Only the CUDA Calling convention can raise exceptions, so we assume here
+    # that it is suitable for creating the error model.
     call_conv = CUDACallConv(targetctx)
     error_model = callconv.create_error_model(flags.error_model, call_conv)
     subtargetoptions["error_model"] = error_model
diff --git a/numba_cuda/numba/cuda/core/generators.py b/numba_cuda/numba/cuda/core/generators.py
index 5a85b4f91..96f22ad7d 100644
--- a/numba_cuda/numba/cuda/core/generators.py
+++ b/numba_cuda/numba/cuda/core/generators.py
@@ -76,7 +76,10 @@ def __init__(self, lower):
         self.geninfo = lower.generator_info
         self.gentype = self.get_generator_type()
         self.gendesc = GeneratorDescriptor.from_generator_fndesc(
-            lower.func_ir, self.fndesc, self.gentype, self.call_conv.mangler
+            lower.func_ir,
+            self.fndesc,
+            self.gentype,
+            self.fndesc.call_conv.mangler,
         )
         # Helps packing non-omitted arguments into a structure
         self.arg_packer = self.context.get_data_packer(self.fndesc.argtypes)
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index 4141ecf7f..3909ac003 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -265,6 +265,15 @@ def nonconst_module_attrs(self):
         )
         return nonconsts_with_mod
 
+    @property
+    def call_conv(self):
+        warnings.warn(
+            "Context.call_conv is deprecated. "
+            "Use FunctionDescriptor.call_conv instead.",
+            DeprecationWarning,
+        )
+        return self.fndesc.call_conv
+
     def make_constant_array(self, builder, aryty, arr):
         """
         Unlike the parent version.  This returns a a pointer in the constant

From b46435a7df5e05aad816de5df554d68e3cbad996 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 3 Feb 2026 11:37:20 -0800
Subject: [PATCH 18/18] ignore call_conv deprecation warning

---
 ci/test_thirdparty_awkward.sh | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/ci/test_thirdparty_awkward.sh b/ci/test_thirdparty_awkward.sh
index 6349ac1c2..b1ba442f3 100755
--- a/ci/test_thirdparty_awkward.sh
+++ b/ci/test_thirdparty_awkward.sh
@@ -74,6 +74,21 @@ index 39080a34..0eb3940f 100644
      array = rng.integers(50, size=1000)
 EOF
 
+patch -p1 <<'EOF'
+diff --git a/pyproject.toml b/pyproject.toml
+index 78ecfba9..80a25474 100644
+--- a/pyproject.toml
++++ b/pyproject.toml
+@@ -136,6 +136,7 @@ filterwarnings = [
+     "ignore:.*np\\.MachAr.*:DeprecationWarning",
+     "ignore:module 'sre_.*' is deprecated:DeprecationWarning",
+     "ignore:Jitify is performing a one-time only warm-up",
++    "ignore:Context.call_conv is deprecated.",
+ ]
+ log_cli_level = "INFO"
+ testpaths = ["tests", "tests-cuda", "tests-cuda-kernels", "tests-cuda-kernels-explicit"]
+EOF
+
 rapids-logger "Generate awkward tests"
 nox -s prepare -- --tests