Merged

Commits (41)
2827d0d
initial
isVoid Mar 3, 2025
7806707
mocking object type using cuda.core objects
isVoid Mar 5, 2025
bc10ce3
Update to ctypes module wrapper and type checks
isVoid Mar 13, 2025
54bc46c
add managed module object
isVoid Mar 14, 2025
067ea82
Merge branch 'fea-link-callback' of https://github.com/isVoid/numba-c…
isVoid Mar 14, 2025
51dd293
add two kernels test
isVoid Mar 15, 2025
d05f76b
add kernel finalizer
isVoid Mar 17, 2025
b329831
removing kernel finalizers
isVoid Mar 17, 2025
c6a9b73
add docstring
isVoid Mar 17, 2025
881648c
add doc
isVoid Mar 17, 2025
6f7fe2a
Update docs/source/user/cuda_ffi.rst
isVoid Mar 18, 2025
856536e
Merge remote-tracking branch 'NVIDIA/main' into fea-link-callback
gmarkall Mar 19, 2025
d4cc4ac
add a test that involves two streams
isVoid Mar 19, 2025
af2d618
add wipe all module call
isVoid Mar 19, 2025
a5a3671
use context reset is a better option to unload modules
isVoid Mar 19, 2025
577c8ff
add test for stream completeness
isVoid Mar 19, 2025
d8f2f23
move logic into CtypesModule
isVoid Mar 20, 2025
bb13580
update docstrings
isVoid Mar 20, 2025
379a69b
consolidate changes into create_module_image
isVoid Mar 20, 2025
c5d21fe
explicitly delete kernel reference
isVoid Mar 20, 2025
36ad115
remove stream from setup and teardown callbacks
isVoid Mar 21, 2025
2cc4c28
remove counter
isVoid Mar 21, 2025
01d2d85
add API coverage test
isVoid Mar 21, 2025
24e4260
asserting types of passed in module handle
isVoid Mar 21, 2025
328c430
update documentation
isVoid Mar 21, 2025
43bcfa2
address review comments
isVoid Mar 21, 2025
7663093
update linkable code doc
isVoid Mar 21, 2025
97367ce
update the tests to acommodate nvidia bindings
isVoid Mar 21, 2025
1535233
setup should raise an error if module is already initialized
isVoid Mar 21, 2025
79dd76e
Update docs/source/user/cuda_ffi.rst
isVoid Mar 21, 2025
b0ff099
add input type guards for linkable code
isVoid Mar 21, 2025
497f0eb
Fix docstrings
gmarkall Mar 24, 2025
df7427e
add lock to protect initialization secton
isVoid Mar 25, 2025
6aa1120
add documentation
isVoid Mar 25, 2025
40f4e9b
add multithreaded callback behavior test
isVoid Mar 27, 2025
0ba5dee
Merge branch 'fea-link-callback' of github.com:isVoid/numba-cuda into…
isVoid Mar 27, 2025
c24e0b2
Replace flake8 with ruff and pre-commit-hooks
ZzEeKkAa Mar 21, 2025
075b7bd
Apply precommit
isVoid Apr 9, 2025
4b3650a
Merge remote-tracking branch 'origin' into fea-link-callback
isVoid Apr 9, 2025
28d9dc8
apply compile lock to make sure modules are not compiled more than on…
isVoid Apr 10, 2025
6781263
Merge branch 'main' of https://github.com/NVIDIA/numba-cuda into fea-…
isVoid Apr 10, 2025
21 changes: 21 additions & 0 deletions docs/source/user/cuda_ffi.rst
@@ -160,6 +160,27 @@ CUDA C/C++ source code will be compiled with the `NVIDIA Runtime Compiler
kernel as either PTX or LTOIR, depending on whether LTO is enabled. Other files
will be passed directly to the CUDA Linker.

A ``LinkableCode`` object may have setup and teardown callback functions that
perform module-specific initialization and cleanup tasks.

* Setup functions are invoked once for every new module loaded.
* Teardown functions are invoked just prior to module unloading.

Both setup and teardown callbacks are called with a handle to the relevant
module. In practice, Numba creates a new module each time a kernel is compiled
for a specific set of argument types.

For each module, the setup callback is invoked only once. When a module is
used by multiple threads, only one thread will execute the setup
callback.

The callbacks are defined as follows:

.. code::

def setup_callback(mod: cuda.cudadrv.drvapi.cu_module):...
def teardown_callback(mod: cuda.cudadrv.drvapi.cu_module):...
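
As an illustrative sketch (the device source, file name, and callback bodies
below are placeholders, and it assumes ``CUSource`` forwards the
``setup_callback`` and ``teardown_callback`` parameters of its base class),
callbacks can be attached when the code object is constructed and linked
into a kernel:

.. code:: python

   from numba import cuda

   def setup_callback(mod):
       # Invoked once per loaded module; mod is a handle to that module.
       print("module loaded:", mod)

   def teardown_callback(mod):
       # Invoked just before the module is unloaded.
       print("module unloading:", mod)

   source = cuda.CUSource(
       'extern "C" __device__ int nop(int* out) { *out = 0; return 0; }',
       name="callbacks_demo.cu",
       setup_callback=setup_callback,
       teardown_callback=teardown_callback,
   )

   @cuda.jit(link=[source])
   def kernel():
       pass

   kernel[1, 1]()  # loading the kernel's module triggers setup_callback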

:class:`LinkableCode <numba.cuda.LinkableCode>` objects are initialized using
the parameters of their base class:

21 changes: 18 additions & 3 deletions numba_cuda/numba/cuda/codegen.py
@@ -4,6 +4,7 @@
from numba.core.codegen import Codegen, CodeLibrary
from .cudadrv import devices, driver, nvvm, runtime
from numba.cuda.cudadrv.libs import get_cudalib
from numba.cuda.cudadrv.linkable_code import LinkableCode

import os
import subprocess
@@ -99,6 +100,12 @@ def __init__(
# Files to link with the generated PTX. These are linked using the
# Driver API at link time.
self._linking_files = set()
# List of setup functions for the loaded module; the order is
# determined by the order in which they are added to the codelib.
self._setup_functions = []
# List of teardown functions for the loaded module; the order is
# determined by the order in which they are added to the codelib.
self._teardown_functions = []
# Should we link libcudadevrt?
self.needs_cudadevrt = False

@@ -251,7 +258,9 @@ def get_cufunc(self):
return cufunc

cubin = self.get_cubin(cc=device.compute_capability)
module = ctx.create_module_image(cubin)
module = ctx.create_module_image(
cubin, self._setup_functions, self._teardown_functions
)

# Load
cufunc = module.get_function(self._entry_name)
@@ -289,8 +298,14 @@ def add_linking_library(self, library):

self._linking_libraries.add(library)

def add_linking_file(self, filepath):
self._linking_files.add(filepath)
def add_linking_file(self, path_or_obj):
if isinstance(path_or_obj, LinkableCode):
if path_or_obj.setup_callback:
self._setup_functions.append(path_or_obj.setup_callback)
if path_or_obj.teardown_callback:
self._teardown_functions.append(path_or_obj.teardown_callback)

self._linking_files.add(path_or_obj)

def get_function(self, name):
for fn in self._module.functions:
80 changes: 69 additions & 11 deletions numba_cuda/numba/cuda/cudadrv/driver.py
@@ -381,9 +381,6 @@ def _find_api(self, fname):
else:
variants = ("_v2", "")

if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
return getattr(self.lib, fname)

for variant in variants:
try:
return getattr(self.lib, f"{fname}{variant}")
@@ -1478,8 +1475,12 @@ def create_module_ptx(self, ptx):
image = c_char_p(ptx)
return self.create_module_image(image)

def create_module_image(self, image):
module = load_module_image(self, image)
def create_module_image(
self, image, setup_callbacks=None, teardown_callbacks=None
):
module = load_module_image(
self, image, setup_callbacks, teardown_callbacks
)
if USE_NV_BINDING:
key = module.handle
else:
@@ -1578,17 +1579,25 @@ def __ne__(self, other):
return not self.__eq__(other)


def load_module_image(context, image):
def load_module_image(
context, image, setup_callbacks=None, teardown_callbacks=None
):
"""
image must be a pointer
"""
if USE_NV_BINDING:
return load_module_image_cuda_python(context, image)
return load_module_image_cuda_python(
context, image, setup_callbacks, teardown_callbacks
)
else:
return load_module_image_ctypes(context, image)
return load_module_image_ctypes(
context, image, setup_callbacks, teardown_callbacks
)


def load_module_image_ctypes(context, image):
def load_module_image_ctypes(
context, image, setup_callbacks, teardown_callbacks
):
logsz = config.CUDA_LOG_SIZE

jitinfo = (c_char * logsz)()
@@ -1621,10 +1630,14 @@ def load_module_image_ctypes(context, image):
handle,
info_log,
_module_finalizer(context, handle),
setup_callbacks,
teardown_callbacks,
)


def load_module_image_cuda_python(context, image):
def load_module_image_cuda_python(
context, image, setup_callbacks, teardown_callbacks
):
"""
image must be a pointer
"""
Expand Down Expand Up @@ -1661,6 +1674,8 @@ def load_module_image_cuda_python(context, image):
handle,
info_log,
_module_finalizer(context, handle),
setup_callbacks,
teardown_callbacks,
)


@@ -2455,13 +2470,27 @@ def event_elapsed_time(evtstart, evtend):
class Module(metaclass=ABCMeta):
"""Abstract base class for modules"""

def __init__(self, context, handle, info_log, finalizer=None):
def __init__(
self,
context,
handle,
info_log,
finalizer=None,
setup_callbacks=None,
teardown_callbacks=None,
):
self.context = context
self.handle = handle
self.info_log = info_log
if finalizer is not None:
self._finalizer = weakref.finalize(self, finalizer)

self.initialized = False
self.setup_functions = setup_callbacks
self.teardown_functions = teardown_callbacks

self._set_finalizers()

def unload(self):
"""Unload this module from the context"""
self.context.unload_module(self)
@@ -2474,6 +2503,35 @@ def get_function(self, name):
def get_global_symbol(self, name):
"""Return a MemoryPointer referring to the named symbol"""

def setup(self):
"""Call the setup functions for the module"""
if self.initialized:
raise RuntimeError("The module has already been initialized.")

if self.setup_functions is None:
return

for f in self.setup_functions:
f(self.handle)

self.initialized = True

def _set_finalizers(self):
"""Create finalizers that tear down the module."""
if self.teardown_functions is None:
return

def _teardown(teardowns, handle):
for f in teardowns:
f(handle)

weakref.finalize(
self,
_teardown,
self.teardown_functions,
self.handle,
)


class CtypesModule(Module):
def get_function(self, name):
17 changes: 16 additions & 1 deletion numba_cuda/numba/cuda/cudadrv/linkable_code.py
@@ -7,11 +7,26 @@ class LinkableCode:
:param data: A buffer containing the data to link.
:param name: The name of the file to be referenced in any compilation or
linking errors that may be produced.
:param setup_callback: A function called prior to the launch of a kernel
contained within a module that has this code object
linked into it.
:param teardown_callback: A function called just prior to the unloading of
a module that has this code object linked into
it.
"""

def __init__(self, data, name=None):
def __init__(
self, data, name=None, setup_callback=None, teardown_callback=None
):
if setup_callback and not callable(setup_callback):
raise TypeError("setup_callback must be callable")
if teardown_callback and not callable(teardown_callback):
raise TypeError("teardown_callback must be callable")

self.data = data
self._name = name
self.setup_callback = setup_callback
self.teardown_callback = teardown_callback

@property
def name(self):
9 changes: 9 additions & 0 deletions numba_cuda/numba/cuda/dispatcher.py
@@ -30,6 +30,7 @@
)
from numba.cuda import types as cuda_types
from numba.cuda.runtime.nrt import rtsys
from numba.cuda.locks import module_init_lock

from numba import cuda
from numba import _dispatcher
@@ -347,12 +348,19 @@ def _reduce_states(self):
extensions=self.extensions,
)

@module_init_lock
Contributor:

Is there any risk that not acquiring this lock at destruction / unloading could leave us in a bad situation?

Contributor Author (isVoid):

I'm crafting a concurrent test so that we can get a more in-depth view of the behavior of setup and teardown with threads. I'll report back.

Contributor Author (isVoid), Mar 26, 2025:

When I delved into multi-threaded kernel launches, this is what I discovered:

  • The compilation runs once, for the first thread that acquires the compilation lock; subsequent threads load the same compiled binary from the cache.
  • Each thread creates its own cuModule pointer and independently invokes cuModuleLoadDataEx on its own copy of the pointer. So N modules will be created if there are N threads.

What this means for the setup callback function:

  1. The setup function is called N times if there are N threads, each invocation with a unique module handle.
  2. There is a lock around the setup section, so the N invocations take place serially. No race condition.

What this means for the teardown callback function (today):

  • Today, the user cannot tear down the kernel, so we assume that all teardown happens after all threads join and the main thread takes care of interpreter shutdown and the finalizers. Finalizers are placed on a FILO stack, so they are invoked serially. No race condition.

What this means for the teardown callback function (if we implement #171):

[EDITED]

  • Each thread holds a reference to the kernel, so del kernel only decrements the count that each thread incremented; the main thread still holds the initial reference to the kernel. In this case the kernel is still finalized at interpreter shutdown, and the above still holds.
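
As an aside, a minimal sketch (hypothetical names) of the FILO behavior described above: weakref.finalize callbacks that are still alive at interpreter exit are invoked in reverse order of registration.

import weakref

class Anchor:
    """Stand-in for a Module object kept alive until interpreter exit."""

def teardown(tag):
    print("teardown:", tag)

anchor = Anchor()
# Finalizers still alive at exit run newest-first, like a FILO stack of
# teardown callbacks.
weakref.finalize(anchor, teardown, "registered first")
weakref.finalize(anchor, teardown, "registered second")
# On normal exit this prints "registered second" before "registered first".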

Contributor:

> • Each thread creates its own cuModule pointer and independently invokes cuModuleLoadDataEx on its own copy of the pointer. So N modules will be created if there are N threads.

Does calling cuModuleLoadDataEx consume valuable resources like global device memory? I would assume so.

Do we have one CUDA Context being used by multiple threads?

If the answer is yes to both of these questions, maybe we should implement a lock where the first thread acquires the lock and creates the module, and then all future threads can retrieve the module from a cache? Then we presumably only need to call the setup callback function once instead of N times?

Contributor Author (isVoid):

> > • Each thread creates its own cuModule pointer and independently invokes cuModuleLoadDataEx on its own copy of the pointer. So N modules will be created if there are N threads.
>
> Does calling cuModuleLoadDataEx consume valuable resources like global device memory? I would assume so.
>
> Do we have one CUDA Context being used by multiple threads?
>
> If the answer is yes to both of these questions, maybe we should implement a lock where the first thread acquires the lock and creates the module, and then all future threads can retrieve the module from a cache? Then we presumably only need to call the setup callback function once instead of N times?

Yes and yes. I think Graham and I touched on these questions before, and we agreed there was a subtle bug with module creation in Numba. Your suggestion makes sense to me.

Contributor:

> Each thread creates its own cuModule pointer and independently invokes cuModuleLoadDataEx on its own copy of the pointer. So N modules will be created if there are N threads.

I thought this sounded odd, and it's definitely not what we want. There should be one module per context, not per thread - I noted why in the main PR comments: #145 (comment)

Contributor Author (isVoid):

I'm happy to address the issue in this PR.

def initialize_once(self, mod):
if not mod.initialized:
mod.setup()

def bind(self):
"""
Force binding to current CUDA context
"""
cufunc = self._codelibrary.get_cufunc()

self.initialize_once(cufunc.module)

if (
hasattr(self, "target_context")
and self.target_context.enable_nrt
@@ -1103,6 +1111,7 @@ def add_overload(self, kernel, argtypes):
self._insert(c_sig, kernel, cuda=True)
self.overloads[argtypes] = kernel

@global_compiler_lock
def compile(self, sig):
"""
Compile and bind to the current context a version of this kernel
16 changes: 16 additions & 0 deletions numba_cuda/numba/cuda/locks.py
@@ -0,0 +1,16 @@
from threading import Lock
from functools import wraps

# Thread safety guard for module initialization.
_module_init_lock = Lock()


def module_init_lock(func):
"""Decorator to make sure initialization is invoked once for all threads."""

@wraps(func)
def wrapper(*args, **kwargs):
with _module_init_lock:
return func(*args, **kwargs)

return wrapper
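
A minimal usage sketch (the wrapped function below is illustrative and mirrors the dispatcher's `initialize_once`): any callable decorated with `module_init_lock` is serialized across threads, so combined with the module's `initialized` flag the setup callbacks run exactly once per module.

from numba.cuda.locks import module_init_lock

@module_init_lock
def initialize_once(mod):
    # Only one thread at a time can enter this section; the flag check
    # ensures the module's setup callbacks are invoked exactly once.
    if not mod.initialized:
        mod.setup()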