Merged
91 changes: 91 additions & 0 deletions docs/source/user/globals.rst
Contributor:

The examples in this file look like they should be copy-pastable, but they don't run because of missing declarations. Should they be expected to work? (If so, it may be good to convert them to doctests, e.g. https://github.com/NVIDIA/numba-cuda/blob/main/numba_cuda/numba/cuda/tests/doc_examples/test_random.py / https://github.com/NVIDIA/numba-cuda/blob/main/docs/source/user/examples.rst?plain=1)

Contributor Author:

They were meant to be representative/illustrative. If you think it's useful, I'll replace them with real doctests; thanks!

@@ -0,0 +1,91 @@
..
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-2-Clause


.. _cuda-globals:

=====================================
Global Variables and Captured Values
=====================================

Numba CUDA kernels and device functions can reference global variables defined
at module scope. This section describes how these values are captured and the
implications for your code.


Capture as constants
====================

By default, global variables referenced in kernels are captured as constants at
compilation time. This applies to scalars and host arrays (e.g. NumPy arrays).

The following example demonstrates this behavior. Both ``TAX_RATE`` and
``PRICES`` are captured when the kernel is first compiled. Because they are
embedded as constants, **modifications to these variables after compilation
have no effect**—the second kernel call still uses the original values:

.. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_globals.py
:language: python
:caption: Demonstrating constant capture of global variables
:start-after: magictoken.ex_globals_constant_capture.begin
:end-before: magictoken.ex_globals_constant_capture.end
:dedent: 8
:linenos:

Running the above code prints:

.. code-block:: text

Value of d_totals: [ 10.8 54. 16.2 64.8 162. ]
Value of d_totals: [ 10.8 54. 16.2 64.8 162. ]

Note that both outputs are identical—the modifications to ``TAX_RATE`` and
``PRICES`` after the first kernel call have no effect.

This behavior is useful for small amounts of truly constant data like
configuration values, lookup tables, or mathematical constants. For larger
arrays, consider using device arrays instead.


Device array capture
====================

Device arrays are an exception to the constant capture rule. When a kernel
references a global device array—any object implementing
``__cuda_array_interface__``, such as CuPy arrays or Numba device arrays—the
device pointer is captured rather than the data. No copy occurs, and
modifications to the array **are** visible to subsequent kernel calls.

The following example demonstrates this behavior. The global ``PRICES`` device
array is mutated after the first kernel call, and the second kernel call sees
the updated values:

.. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_globals.py
:language: python
:caption: Demonstrating device array capture by pointer
:start-after: magictoken.ex_globals_device_array_capture.begin
:end-before: magictoken.ex_globals_device_array_capture.end
:dedent: 8
:linenos:

Running the above code prints:

.. code-block:: text

[10. 25. 5. 15. 30.]
[20. 50. 10. 30. 60.]

Note that the outputs are different—the mutation to ``PRICES`` after the first
kernel call *is* visible to the second call, unlike with host arrays.

This makes device arrays suitable for global state that needs to be updated
between kernel calls without recompilation.

.. note::

Kernels and device functions that capture global device arrays cannot use
``cache=True``. Because the device pointer is embedded in the compiled code,
caching would serialize an invalid pointer. Attempting to cache such a kernel
will raise a ``PicklingError``. See :doc:`caching` for more information on
kernel caching.
1 change: 1 addition & 0 deletions docs/source/user/index.rst
@@ -14,6 +14,7 @@ User guide
kernels.rst
memory.rst
device-functions.rst
globals.rst
cudapysupported.rst
fastmath.rst
intrinsics.rst
21 changes: 21 additions & 0 deletions numba_cuda/numba/cuda/codegen.py
@@ -12,6 +12,7 @@
from numba.cuda.memory_management.nrt import NRT_LIBRARY

import os
import pickle
import subprocess
import tempfile

@@ -189,6 +190,11 @@ def __init__(

self.use_cooperative = False

# Objects that need to be kept alive for the lifetime of the
# kernels or device functions generated by this code library,
# e.g., device arrays captured from global scope.
self.referenced_objects = {}

@property
def llvm_strs(self):
if self._llvm_strs is None:
@@ -377,6 +383,9 @@ def add_linking_library(self, library):
self._setup_functions.extend(library._setup_functions)
self._teardown_functions.extend(library._teardown_functions)
self.use_cooperative |= library.use_cooperative
self.referenced_objects.update(
getattr(library, "referenced_objects", {})
)

def add_linking_file(self, path_or_obj):
if isinstance(path_or_obj, LinkableCode):
@@ -442,6 +451,18 @@ def _reduce_states(self):
but loaded functions are discarded. They are recreated when needed
after deserialization.
"""
# Check for captured device arrays that cannot be safely cached.
if self.referenced_objects:
if any(
getattr(obj, "__cuda_array_interface__", None) is not None
for obj in self.referenced_objects.values()
):
raise pickle.PicklingError(
"Cannot serialize kernels or device functions referencing "
"global device arrays. Pass the array(s) as arguments "
"to the kernel instead."
)

nrt = False
if self._linking_files:
if (
7 changes: 5 additions & 2 deletions numba_cuda/numba/cuda/core/typeinfer.py
@@ -1738,8 +1738,11 @@ def typeof_global(self, inst, target, gvar):
)

if isinstance(typ, types.Array):
# Global array in nopython mode is constant
typ = typ.copy(readonly=True)
# Global array in nopython mode is constant, except for device
# arrays implementing __cuda_array_interface__ which are references
# to mutable device memory
if not hasattr(gvar.value, "__cuda_array_interface__"):
typ = typ.copy(readonly=True)

if isinstance(typ, types.BaseAnonymousTuple):
# if it's a tuple of literal types, swap the type for the more
54 changes: 54 additions & 0 deletions numba_cuda/numba/cuda/np/arrayobj.py
@@ -31,6 +31,7 @@
type_is_scalar,
lt_complex,
lt_floats,
strides_from_shape,
)
from numba.cuda.np.numpy_support import (
type_can_asarray,
@@ -3642,10 +3643,63 @@ def record_static_setitem_int(context, builder, sig, args):
def constant_array(context, builder, ty, pyval):
"""
Create a constant array (mechanism is target-dependent).

For objects implementing __cuda_array_interface__,
the device pointer is embedded directly as a constant. For other arrays,
the target-specific mechanism is used.
"""
# Check if this is a device array (implements __cuda_array_interface__)
if getattr(pyval, "__cuda_array_interface__", None) is not None:
return _lower_constant_device_array(context, builder, ty, pyval)

return context.make_constant_array(builder, ty, pyval)


def _lower_constant_device_array(context, builder, ty, pyval):
"""
Lower objects with __cuda_array_interface__ by embedding the device
pointer as a constant.

This allows device arrays captured from globals to be used in CUDA
kernels and device functions.
"""
interface = pyval.__cuda_array_interface__

# Hold on to the device array to prevent garbage collection.
context.active_code_library.referenced_objects[id(pyval)] = pyval

shape = interface["shape"]
strides = interface.get("strides")
data_ptr = interface["data"][0]
typestr = interface["typestr"]
itemsize = np.dtype(typestr).itemsize

# Calculate strides if not provided (C-contiguous)
if strides is None:
strides = strides_from_shape(shape, itemsize, order="C")

# Embed device pointer as constant
llvoidptr = context.get_value_type(types.voidptr)
data = context.get_constant(types.uintp, data_ptr).inttoptr(llvoidptr)

# Build array structure
ary = context.make_array(ty)(context, builder)
kshape = [context.get_constant(types.intp, s) for s in shape]
kstrides = [context.get_constant(types.intp, s) for s in strides]

context.populate_array(
ary,
data=builder.bitcast(data, ary.data.type),
shape=kshape,
strides=kstrides,
itemsize=context.get_constant(types.intp, itemsize),
parent=None,
meminfo=None,
)

return ary._getvalue()
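To make the interface handling concrete, here is a minimal, GPU-free sketch of how the fields of `__cuda_array_interface__` are read, in the same order as `_lower_constant_device_array` above. `FakeDeviceArray` and its data pointer are hypothetical stand-ins, not numba-cuda API:

```python
# Sketch of reading __cuda_array_interface__ the way
# _lower_constant_device_array does. FakeDeviceArray and its data
# pointer are hypothetical stand-ins; no GPU is involved.

class FakeDeviceArray:
    """Mimics the interface exposed by CuPy/Numba device arrays."""

    @property
    def __cuda_array_interface__(self):
        return {
            "shape": (2, 3),
            "typestr": "<f4",             # little-endian float32
            "data": (0xDEADBEEF, False),  # (device pointer, read-only flag)
            "strides": None,              # None means C-contiguous
            "version": 3,
        }


def describe(ary):
    """Extract shape, strides, and the raw device pointer."""
    interface = ary.__cuda_array_interface__
    shape = interface["shape"]
    strides = interface.get("strides")
    data_ptr = interface["data"][0]
    itemsize = 4  # np.dtype(interface["typestr"]).itemsize in the real code
    if strides is None:
        # C-contiguous strides, accumulated right-to-left
        acc, out = itemsize, []
        for dim in reversed(shape):
            out.insert(0, acc)
            acc *= dim
        strides = tuple(out)
    return shape, strides, data_ptr


print(describe(FakeDeviceArray()))  # ((2, 3), (12, 4), 3735928559)
```

The real lowering then embeds `data_ptr` as an LLVM constant via `inttoptr`, which is exactly why the compiled artifact cannot be cached across processes.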


@lower_constant(types.Record)
def constant_record(context, builder, ty, pyval):
"""
26 changes: 26 additions & 0 deletions numba_cuda/numba/cuda/np/numpy_support.py
@@ -3,7 +3,10 @@

import collections
import ctypes
import itertools
import operator
import re

import numpy as np

from numba.cuda import types
@@ -17,6 +20,29 @@

numpy_version = tuple(map(int, np.__version__.split(".")[:2]))


def strides_from_shape(
shape: tuple[int, ...], itemsize: int, *, order: str
) -> tuple[int, ...]:
"""Compute strides for a contiguous array with given shape and order."""
if len(shape) == 0:
# 0-D arrays have empty strides
return ()
limits = slice(1, None) if order == "C" else slice(None, -1)
transform = reversed if order == "C" else lambda x: x
strides = tuple(
map(
itemsize.__mul__,
itertools.accumulate(
transform(shape[limits]), operator.mul, initial=1
),
)
)
if order == "F":
return strides
return strides[::-1]
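As a quick sanity check, a standalone replica of this stride computation (for illustration only; the real function lives in `numba_cuda.numba.cuda.np.numpy_support`) reproduces the usual contiguous strides:

```python
# Standalone replica of strides_from_shape, for illustration.
import itertools
import operator


def strides_from_shape(shape, itemsize, *, order):
    """Compute strides for a contiguous array with given shape and order."""
    if len(shape) == 0:
        return ()  # 0-D arrays have empty strides
    limits = slice(1, None) if order == "C" else slice(None, -1)
    transform = reversed if order == "C" else lambda x: x
    strides = tuple(
        map(
            itemsize.__mul__,
            itertools.accumulate(
                transform(shape[limits]), operator.mul, initial=1
            ),
        )
    )
    return strides if order == "F" else strides[::-1]


# A (2, 3, 4) array with itemsize 8 (e.g. float64):
print(strides_from_shape((2, 3, 4), 8, order="C"))  # (96, 32, 8)
print(strides_from_shape((2, 3, 4), 8, order="F"))  # (8, 16, 48)
```

These match the strides NumPy reports for C- and F-contiguous arrays of that shape and dtype.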


FROM_DTYPE = {
np.dtype("bool"): types.boolean,
np.dtype("int8"): types.int8,
10 changes: 10 additions & 0 deletions numba_cuda/numba/cuda/serialize.py
@@ -197,6 +197,16 @@ def reducer_override(self, obj):
# Overridden to disable pickling of certain types
if type(obj) in self.disabled_types:
_no_pickle(obj) # noreturn

# Prevent pickling of objects implementing __cuda_array_interface__
Contributor:

I think this has no effect because it never gets to see an object with the CUDA Array Interface when pickling. The _reduce_states() method of the CUDACodeLibrary has no reference to referenced_objects, so the serialization erroneously succeeds.

To prevent caching, the CUDACodeLibrary needs to detect that it holds referenced objects:

diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py
index 957dd72e..9ee91e29 100644
--- a/numba_cuda/numba/cuda/codegen.py
+++ b/numba_cuda/numba/cuda/codegen.py
@@ -463,6 +463,10 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
 
         if not self._finalized:
             raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary")
+
+        if self.referenced_objects:
+            raise RuntimeError("Cannot pickle...")
+
         return dict(
             codegen=None,
             name=self.name,

Contributor Author (@shwina, Dec 17, 2025):

I see -- the issue is that for closure variables, we are able to raise this PicklingError correctly; this happens starting at `cvarbytes = dumps(cvars)`. The tests I have currently use closure variables (not globals).

I followed your suggestion to correctly handle global device arrays in CUDACodeLibrary. Now we raise a PicklingError for both cases.

acce40d

# These contain device pointers that would become stale after unpickling
if getattr(obj, "__cuda_array_interface__", None) is not None:
raise pickle.PicklingError(
"Cannot serialize kernels or device functions referencing "
"global device arrays. Pass the array(s) as arguments "
"to the kernel instead."
)

return super().reducer_override(obj)


47 changes: 47 additions & 0 deletions numba_cuda/numba/cuda/tests/cudapy/test_caching.py
@@ -25,6 +25,11 @@
temp_directory,
import_dynamic,
)
import numpy as np
from pickle import PicklingError

# Module-level global for testing that caching rejects global device arrays
GLOBAL_DEVICE_ARRAY = None


class BaseCacheTest(TestCase):
@@ -368,6 +373,48 @@ def test_cannot_cache_linking_libraries(self):
def f():
pass

def test_cannot_cache_captured_device_array(self):
# Test that kernels capturing device arrays from closures cannot
# be cached. The error can come from either NumbaPickler (for closure
# variables) or CUDACodeLibrary._reduce_states (for referenced objects).
host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
captured_arr = cuda.to_device(host_data)

msg = "global device arrays"
with self.assertRaisesRegex(PicklingError, msg):

@cuda.jit(cache=True)
def cached_kernel(output):
i = cuda.grid(1)
if i < output.size:
output[i] = captured_arr[i] * 2.0

output = cuda.device_array(3, dtype=np.float32)
cached_kernel[1, 3](output)

def test_cannot_cache_global_device_array(self):
# Test that kernels referencing module-level global device arrays
# cannot be cached.
global GLOBAL_DEVICE_ARRAY

host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
GLOBAL_DEVICE_ARRAY = cuda.to_device(host_data)

try:
msg = "global device arrays"
with self.assertRaisesRegex(PicklingError, msg):

@cuda.jit(cache=True)
def cached_kernel_global(output):
i = cuda.grid(1)
if i < output.size:
output[i] = GLOBAL_DEVICE_ARRAY[i] * 2.0

output = cuda.device_array(3, dtype=np.float32)
cached_kernel_global[1, 3](output)
finally:
GLOBAL_DEVICE_ARRAY = None


@skip_on_cudasim("Simulator does not implement caching")
class CUDACooperativeGroupTest(DispatcherCacheUsecasesTest):