Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions ci/test_thirdparty_awkward.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,21 @@ index 39080a34..0eb3940f 100644
array = rng.integers(50, size=1000)
EOF

patch -p1 <<'EOF'
diff --git a/pyproject.toml b/pyproject.toml
index 78ecfba9..80a25474 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -136,6 +136,7 @@ filterwarnings = [
"ignore:.*np\\.MachAr.*:DeprecationWarning",
"ignore:module 'sre_.*' is deprecated:DeprecationWarning",
"ignore:Jitify is performing a one-time only warm-up",
+ "ignore:Context.call_conv is deprecated.",
]
log_cli_level = "INFO"
testpaths = ["tests", "tests-cuda", "tests-cuda-kernels", "tests-cuda-kernels-explicit"]
EOF

rapids-logger "Generate awkward tests"
nox -s prepare -- --tests

Expand Down
8 changes: 4 additions & 4 deletions numba_cuda/numba/cuda/cgutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,12 +764,12 @@ def _dbg():
with if_unlikely(builder, out_of_bounds_upper):
if config.FULL_TRACEBACKS:
_dbg()
context.call_conv.return_user_exc(builder, IndexError, (msg,))
context.fndesc.call_conv.return_user_exc(builder, IndexError, (msg,))
out_of_bounds_lower = builder.icmp_signed("<", ind, ind.type(0))
with if_unlikely(builder, out_of_bounds_lower):
if config.FULL_TRACEBACKS:
_dbg()
context.call_conv.return_user_exc(builder, IndexError, (msg,))
context.fndesc.call_conv.return_user_exc(builder, IndexError, (msg,))


def get_item_pointer2(
Expand Down Expand Up @@ -936,7 +936,7 @@ def guard_null(context, builder, value, exc_tuple):
with builder.if_then(is_scalar_zero(builder, value), likely=False):
exc = exc_tuple[0]
exc_args = exc_tuple[1:] or None
context.call_conv.return_user_exc(builder, exc, exc_args)
context.fndesc.call_conv.return_user_exc(builder, exc, exc_args)


def guard_memory_error(context, builder, pointer, msg=None):
Expand All @@ -946,7 +946,7 @@ def guard_memory_error(context, builder, pointer, msg=None):
assert isinstance(pointer.type, ir.PointerType), pointer.type
exc_args = (msg,) if msg else ()
with builder.if_then(is_null(builder, pointer), likely=False):
context.call_conv.return_user_exc(builder, MemoryError, exc_args)
context.fndesc.call_conv.return_user_exc(builder, MemoryError, exc_args)


@contextmanager
Expand Down
93 changes: 27 additions & 66 deletions numba_cuda/numba/cuda/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from numba.cuda.cudadrv.linkable_code import LinkableCode
from numba.cuda.descriptor import cuda_target
from numba.cuda.flags import CUDAFlags
from numba.cuda.target import CUDACABICallConv
from numba.cuda.core.callconv import CUDACABICallConv, CUDACallConv
from numba.cuda.core.compiler import CompilerBase
from numba.cuda.core.compiler_machinery import (
FunctionPass,
Expand Down Expand Up @@ -699,6 +699,8 @@ def compile_cuda(
cc=None,
max_registers=None,
lto=False,
abi="numba",
abi_info=None,
):
if cc is None:
raise ValueError("Compute Capability must be supplied")
Expand Down Expand Up @@ -741,6 +743,12 @@ def compile_cuda(
flags.max_registers = max_registers
flags.lto = lto

if abi == "c":
flags.call_conv = CUDACABICallConv(targetctx)

if abi_info is not None:
flags.abi_info = abi_info

with utils.numba_target_override():
cres = compile_extra(
typingctx=typingctx,
Expand All @@ -759,57 +767,6 @@ def compile_cuda(
return cres


def cabi_wrap_function(
    context, lib, fndesc, wrapper_function_name, nvvm_options
):
    """
    Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.

    The C ABI wrapper will have the same name as the source Python function.

    Parameters
    ----------
    context : target context used for type layout, modules, and call conv.
    lib : codegen library containing the already-compiled Numba-ABI function.
    fndesc : function descriptor of the wrapped function (argtypes, restype,
        llvm_func_name).
    wrapper_function_name : symbol name to give the C ABI entry point.
    nvvm_options : options forwarded to the new library's NVVM compilation.

    Returns
    -------
    A new, finalized library whose entry point is the C ABI wrapper and which
    links against *lib*.
    """
    # The wrapper will be contained in a new library that links to the wrapped
    # function's library
    library = lib.codegen.create_library(
        f"{lib.name}_function_",
        entry_name=wrapper_function_name,
        nvvm_options=nvvm_options,
    )
    library.add_linking_library(lib)

    # Determine the caller (C ABI) and wrapper (Numba ABI) function types
    argtypes = fndesc.argtypes
    restype = fndesc.restype
    c_call_conv = CUDACABICallConv(context)
    wrapfnty = c_call_conv.get_function_type(restype, argtypes)
    fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)

    # Create a new module and declare the callee
    wrapper_module = context.create_module("cuda.cabi.wrapper")
    func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)

    # Define the caller - populate it with a call to the callee and return
    # its return value

    wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
    builder = ir.IRBuilder(wrapfn.append_basic_block(""))

    # Repack the raw C ABI arguments into the form the Numba ABI callee
    # expects before forwarding them.
    arginfo = context.get_arg_packer(argtypes)
    callargs = arginfo.from_arguments(builder, wrapfn.args)
    # We get (status, return_value), but we ignore the status since we
    # can't propagate it through the C ABI anyway
    _, return_value = context.call_conv.call_function(
        builder, func, restype, argtypes, callargs
    )
    builder.ret(return_value)

    if config.DUMP_LLVM:
        utils.dump_llvm(fndesc, wrapper_module)

    library.add_ir_module(wrapper_module)
    library.finalize()
    return library


def kernel_fixup(kernel, debug):
if debug:
exc_helper = add_exception_store_helper(kernel)
Expand Down Expand Up @@ -934,7 +891,7 @@ def define_error_gv(postfix):
# Implement status check / exception store logic

status_code = helper_func.args[0]
call_conv = cuda_target.target_context.call_conv
call_conv = CUDACallConv(cuda_target.target_context)
status = call_conv._get_return_status(builder, status_code)

# Check error status
Expand Down Expand Up @@ -1118,23 +1075,16 @@ def _compile_pyfunc_with_fixup(
nvvm_options=nvvm_options,
cc=cc,
forceinline=forceinline,
abi=abi,
abi_info=abi_info,
)
resty = cres.signature.return_type

if resty and not device and resty != types.void:
raise TypeError("CUDA kernel must have void return type.")
Comment on lines 1080 to 1084
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed cabi_wrap_function call for device functions with C ABI. Verify this was intentional and that device functions with abi="c" still work correctly without the wrapper.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is indeed intentional, because functions / kernels can bear their own calling convention, rather than using a target-wide calling convention. cabi_wrap_function was meant to patch a function with Numba ABI to a caller that expects a CABI function. The fix in this PR is simply switching the kernel to bear a CUDACABICallConv and should work as expected.


tgt = cres.target_context

if device:
lib = cres.library
if abi == "c":
wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
lib = cabi_wrap_function(
tgt, lib, cres.fndesc, wrapper_name, nvvm_options
)
else:
lib = cres.library
lib = cres.library
if not device:
kernel = lib.get_function(cres.fndesc.llvm_func_name)
lib._entry_name = cres.fndesc.llvm_func_name
kernel_fixup(kernel, debug)
Expand Down Expand Up @@ -1355,7 +1305,9 @@ def compile_ptx_for_current_device(
)


def declare_device_function(name, restype, argtypes, link, use_cooperative):
def declare_device_function(
name, restype, argtypes, link, use_cooperative, abi
):
from .descriptor import cuda_target

typingctx = cuda_target.typing_context
Expand All @@ -1376,9 +1328,18 @@ def declare_device_function(name, restype, argtypes, link, use_cooperative):
lib.add_linking_file(file)
lib.use_cooperative = use_cooperative

if abi == "numba":
call_conv = CUDACallConv(targetctx)
elif abi == "c":
call_conv = CUDACABICallConv(targetctx)
else:
raise NotImplementedError(f"Unsupported ABI: {abi}")

# ExternalFunctionDescriptor provides a lowering implementation for calling
# external functions
fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
fndesc = funcdesc.ExternalFunctionDescriptor(
name, restype, argtypes, call_conv
)
targetctx.insert_user_function(extfn, fndesc, libs=(lib,))

return device_function_template
Expand Down
19 changes: 3 additions & 16 deletions numba_cuda/numba/cuda/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,19 +468,6 @@ def get_external_function_type(self, fndesc):
fnty = llvmir.FunctionType(restype, argtypes)
return fnty

def declare_function(self, module, fndesc):
    """
    Declare the function described by *fndesc* in LLVM *module* and return
    the (possibly pre-existing) llvmlite function object.

    The function type and argument decoration come from the target's calling
    convention; the symbol name is the descriptor's mangled name.
    """
    fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes)
    fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name)
    self.call_conv.decorate_function(
        fn, fndesc.args, fndesc.argtypes, noalias=fndesc.noalias
    )
    if fndesc.inline:
        fn.attributes.add("alwaysinline")
        # alwaysinline overrides optnone
        fn.attributes.discard("noinline")
        fn.attributes.discard("optnone")
    return fn

def declare_external_function(self, module, fndesc):
fnty = self.get_external_function_type(fndesc)
fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name)
Expand Down Expand Up @@ -975,7 +962,7 @@ def call_internal(self, builder, fndesc, sig, args):
builder, fndesc, sig, args
)
with cgutils.if_unlikely(builder, status.is_error):
self.call_conv.return_status_propagate(builder, status)
fndesc.call_conv.return_status_propagate(builder, status)

res = imputils.fix_returning_optional(self, builder, sig, status, res)
return res
Expand All @@ -986,8 +973,8 @@ def call_internal_no_propagate(self, builder, fndesc, sig, args):
"""
# Add call to the generated function
llvm_mod = builder.module
fn = self.declare_function(llvm_mod, fndesc)
status, res = self.call_conv.call_function(
fn = fndesc.declare_function(llvm_mod)
status, res = fndesc.call_conv.call_function(
builder, fn, sig.return_type, sig.args, args
)
return status, res
Expand Down
97 changes: 95 additions & 2 deletions numba_cuda/numba/cuda/core/callconv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from numba.cuda import types
from numba.cuda import cgutils
from numba.cuda import itanium_mangler
from collections import namedtuple

from llvmlite import ir
Expand Down Expand Up @@ -161,6 +162,11 @@ def _get_arg_packer(self, argtypes):
"""
return self.context.get_arg_packer(argtypes)

def mangler(self, name, argtypes, *, abi_tags=(), uid=None):
    """
    Mangle *name* for *argtypes* using Itanium C++-style mangling.

    This is the default symbol-naming scheme for call conventions; subclasses
    (e.g. a C ABI convention) may override it to produce unmangled names.
    """
    return itanium_mangler.mangle(
        name, argtypes, abi_tags=abi_tags, uid=uid
    )


class MinimalCallConv(BaseCallConv):
"""
Expand Down Expand Up @@ -341,6 +347,93 @@ def get_exception(self, exc_id):
return exc, exc_args, locinfo


class CUDACallConv(MinimalCallConv):
    def decorate_function(self, fn, args, fe_argtypes, noalias=False):
        """
        Name the LLVM arguments of *fn* after the Python-level arguments.
        """
        assert not noalias
        # The first LLVM argument carries the return value.
        fn.args[0].name = ".ret"
        # Use the bare argument names (no "arg." prefix) so that the NVVM
        # compiler can track debug info for each argument more accurately.
        packer = self._get_arg_packer(fe_argtypes)
        packer.assign_names(self.get_arguments(fn), args)


class CUDACABICallConv(BaseCallConv):
    """
    Calling convention aimed at matching the CUDA C/C++ ABI. The implemented
    function signature is:

        <Python return type> (<Python arguments>)

    Exceptions are unsupported in this convention: there is no status slot
    through which an error could be reported back to the caller.
    """

    def _make_call_helper(self, builder):
        # Call helpers are used to help report exceptions back to Python, so
        # none is required here.
        return None

    def return_value(self, builder, retval):
        # Return the value directly, C-style; no status packing.
        return builder.ret(retval)

    def return_user_exc(
        self, builder, exc, exc_args=None, loc=None, func_name=None
    ):
        msg = "Python exceptions are unsupported in the CUDA C/C++ ABI"
        raise NotImplementedError(msg)

    def return_status_propagate(self, builder, status):
        msg = "Return status is unsupported in the CUDA C/C++ ABI"
        raise NotImplementedError(msg)

    def get_function_type(self, restype, argtypes):
        """
        Get the LLVM IR Function type for *restype* and *argtypes*.
        """
        arginfo = self._get_arg_packer(argtypes)
        argtypes = list(arginfo.argument_types)
        fnty = ir.FunctionType(self.get_return_type(restype), argtypes)
        return fnty

    def decorate_function(self, fn, args, fe_argtypes, noalias=False):
        """
        Set names and attributes of function arguments.
        """
        assert not noalias
        arginfo = self._get_arg_packer(fe_argtypes)
        arginfo.assign_names(self.get_arguments(fn), ["arg." + a for a in args])

    def get_arguments(self, func):
        """
        Get the Python-level arguments of LLVM *func*.
        """
        # Unlike the Numba ABI, there is no leading return-value argument to
        # skip: every LLVM argument is a Python-level argument.
        return func.args

    def call_function(self, builder, callee, resty, argtys, args):
        """
        Call the Numba-compiled *callee*.

        Returns a ``(status, out)`` pair where *status* is always None, to
        keep the interface parallel with the exception-capable conventions.
        """
        arginfo = self._get_arg_packer(argtys)
        realargs = arginfo.as_arguments(builder, args)
        code = builder.call(callee, realargs)
        # No status required as we don't support exceptions or a distinct None
        # value in a C ABI.
        status = None
        out = self.context.get_returned_value(builder, resty, code)
        return status, out

    def get_return_type(self, ty):
        # The C ABI returns the data model's native return type directly.
        return self.context.data_model_manager[ty].get_return_type()

    def mangler(self, name, argtypes, *, abi_tags=None, uid=None):
        """
        Produce an unmangled (C-style) symbol name from a dotted Numba name.
        """
        if name.startswith(".NumbaEnv."):
            # Environment globals keep a fixed mangled prefix.
            # NOTE(review): "_ZN08NumbaEnv" is assumed to match the naming
            # used by consumers of these environment symbols; the "08" length
            # prefix is unusual for Itanium mangling of the 8-char "NumbaEnv"
            # — confirm against the code that looks these symbols up.
            func_name = name.split(".")[-1]
            return f"_ZN08NumbaEnv{func_name}"
        # Otherwise expose the bare function name, as C callers expect.
        return name.split(".")[-1]


class ErrorModel:
def __init__(self, call_conv):
self.call_conv = call_conv
Expand Down Expand Up @@ -385,8 +478,8 @@ class NumpyErrorModel(ErrorModel):
}


def create_error_model(model_name, context):
def create_error_model(model_name, call_conv):
"""
Create an error model instance for the given target context.
"""
return error_models[model_name](context.call_conv)
return error_models[model_name](call_conv)
8 changes: 7 additions & 1 deletion numba_cuda/numba/cuda/core/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from numba.cuda.core.untyped_passes import ExtractByteCode, FixupArgs
from numba.cuda.core.targetconfig import ConfigStack

from numba.cuda.core.callconv import CUDACallConv


class _CompileStatus:
"""
Expand Down Expand Up @@ -66,7 +68,11 @@ def _make_subtarget(targetctx, flags):
subtargetoptions["enable_nrt"] = True
if flags.fastmath:
subtargetoptions["fastmath"] = flags.fastmath
error_model = callconv.create_error_model(flags.error_model, targetctx)

# Only the CUDA Calling convention can raise exceptions, so we assume here
# that it is suitable for creating the error model.
call_conv = CUDACallConv(targetctx)
error_model = callconv.create_error_model(flags.error_model, call_conv)
subtargetoptions["error_model"] = error_model

return targetctx.subtarget(**subtargetoptions)
Expand Down
Loading
Loading