Skip to content
114 changes: 104 additions & 10 deletions numba_cuda/numba/cuda/debuginfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import abc
import os
from contextlib import contextmanager
from enum import IntEnum

import llvmlite
from llvmlite import ir
Expand Down Expand Up @@ -71,6 +72,16 @@ def _check_polymorphic_debug_info_support():
config.CUDA_DEBUG_POLY_USE_TYPED_CONST = DEBUG_POLY_USE_TYPED_CONST


class DwarfAddressClass(IntEnum):
    """DWARF address-class codes attached to pointer debug metadata.

    Values here are consumed by ``get_dwarf_address_class`` and emitted as
    the ``dwarfAddressSpace`` field of ``DIDerivedType`` pointer nodes.

    NOTE(review): these numeric values are presumed to follow the
    NVIDIA/cuda-gdb DWARF address-class convention (e.g. shared memory is
    0x08) — confirm against the CUDA debugger ABI documentation.
    """

    GENERIC = 0x00
    GLOBAL = 0x01
    REGISTER = 0x02
    CONSTANT = 0x05
    LOCAL = 0x06
    PARAMETER = 0x07
    SHARED = 0x08


@contextmanager
def suspend_emission(builder):
"""Suspends the emission of debug_metadata for the duration of the context
Expand Down Expand Up @@ -179,6 +190,19 @@ def initialize(self):
# constructing subprograms
self.dicompileunit = self._di_compile_unit()

def get_dwarf_address_class(self, addrspace):
    """Translate an NVVM address space into a DWARF address class.

    Returns the matching :class:`DwarfAddressClass` member, or ``None``
    for the generic address space and any unrecognized value.
    """
    # Imported at call time, as in the rest of this module — presumably
    # to avoid an import-time dependency on the driver layer.
    from numba.cuda.cudadrv import nvvm

    translation = {
        nvvm.ADDRSPACE_GLOBAL: DwarfAddressClass.GLOBAL,
        nvvm.ADDRSPACE_SHARED: DwarfAddressClass.SHARED,
        nvvm.ADDRSPACE_CONSTANT: DwarfAddressClass.CONSTANT,
        nvvm.ADDRSPACE_LOCAL: DwarfAddressClass.LOCAL,
    }
    # ADDRSPACE_GENERIC intentionally falls through to None, exactly like
    # any unknown address space.
    return translation.get(addrspace)

def _var_type(self, lltype, size, datamodel=None):
if self._DEBUG:
print(
Expand Down Expand Up @@ -622,6 +646,11 @@ def __init__(self, module, filepath, cgctx, directives_only):
super().__init__(module, filepath, cgctx, directives_only)
# Cache for local variable metadata type and line deduplication
self._vartypelinemap = {}
# Variable address space dictionary
self._var_addrspace_map = {}

def _set_addrspace_map(self, map):
    """Install the ``{variable name: NVVM address space}`` mapping that
    ``mark_variable`` later consults when emitting per-variable debug
    metadata.

    NOTE(review): the parameter shadows the builtin ``map``; harmless
    here, but worth renaming when the call sites can be updated together.
    """
    self._var_addrspace_map = map

def _var_type(self, lltype, size, datamodel=None):
is_bool = False
Expand Down Expand Up @@ -796,6 +825,65 @@ def _var_type(self, lltype, size, datamodel=None):
},
is_distinct=True,
)

# Check if there's actually address space info to handle
addrspace = getattr(self, "_addrspace", None)
if (
isinstance(lltype, ir.LiteralStructType)
and datamodel is not None
and datamodel.inner_models()
and addrspace not in (None, 0)
):
# Process struct with datamodel that has address space info
meta = []
offset = 0
for element, field, model in zip(
lltype.elements, datamodel._fields, datamodel.inner_models()
):
size_field = self.cgctx.get_abi_sizeof(element)
if isinstance(element, ir.PointerType) and field == "data":
# Create pointer type with correct address space
pointee_size = self.cgctx.get_abi_sizeof(element.pointee)
pointee_model = getattr(model, "_pointee_model", None)
pointee_type = self._var_type(
element.pointee, pointee_size, datamodel=pointee_model
)
meta_ptr = {
"tag": ir.DIToken("DW_TAG_pointer_type"),
"baseType": pointee_type,
"size": _BYTE_SIZE * size_field,
}
dwarf_addrclass = self.get_dwarf_address_class(addrspace)
if dwarf_addrclass is not None:
meta_ptr["dwarfAddressSpace"] = int(dwarf_addrclass)
basetype = m.add_debug_info("DIDerivedType", meta_ptr)
else:
basetype = self._var_type(
element, size_field, datamodel=model
)
derived_type = m.add_debug_info(
"DIDerivedType",
{
"tag": ir.DIToken("DW_TAG_member"),
"name": field,
"baseType": basetype,
"size": _BYTE_SIZE * size_field,
"offset": offset,
},
)
meta.append(derived_type)
offset += _BYTE_SIZE * size_field

return m.add_debug_info(
"DICompositeType",
{
"tag": ir.DIToken("DW_TAG_structure_type"),
"name": f"{datamodel.fe_type}",
"elements": m.add_metadata(meta),
"size": offset,
},
is_distinct=True,
)
# For other cases, use upstream Numba implementation
return super()._var_type(lltype, size, datamodel=datamodel)

Expand Down Expand Up @@ -848,16 +936,22 @@ def mark_variable(
# to llvm.dbg.value
return
else:
return super().mark_variable(
builder,
allocavalue,
name,
lltype,
size,
line,
datamodel,
argidx,
)
# Look up address space for this variable
self._addrspace = self._var_addrspace_map.get(name)
try:
return super().mark_variable(
builder,
allocavalue,
name,
lltype,
size,
line,
datamodel,
argidx,
)
finally:
# Clean up address space info
self._addrspace = None

def update_variable(
self,
Expand Down
29 changes: 29 additions & 0 deletions numba_cuda/numba/cuda/lowering.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from numba.cuda import HAS_NUMBA
from numba.cuda.core import ir
from numba.cuda import debuginfo, cgutils, utils, typing, types
from numba import cuda
from numba.cuda.core import (
ir_utils,
targetconfig,
Expand Down Expand Up @@ -1677,10 +1678,31 @@ def decref(self, typ, val):


class CUDALower(Lower):
def _is_shared_array_call(self, fnty):
    """Return True when *fnty* is the typing of ``cuda.shared.array``."""
    # Function types without a typing_key can never match; getattr's
    # None default makes the identity test fail for them.
    return getattr(fnty, "typing_key", None) is cuda.shared.array

def _lower_call_normal(self, fnty, expr, signature):
    """Lower an ordinary call expression.

    When debug info is enabled and the callee is ``cuda.shared.array``,
    set a marker so the subsequent ``storevar`` can record the stored
    variable as living in the shared address space.
    """
    wants_tracking = self.context.enable_debuginfo
    if wants_tracking and self._is_shared_array_call(fnty):
        # Consumed by storevar() once a user-visible name is stored.
        self._pending_shared_store = True
    return super()._lower_call_normal(fnty, expr, signature)

def storevar(self, value, name, argidx=None):
"""
Store the value into the given variable.
"""
# Track address space for debug info
if self.context.enable_debuginfo and self._pending_shared_store:
from numba.cuda.cudadrv import nvvm

self._addrspace_map[name] = nvvm.ADDRSPACE_SHARED
if not name.startswith("$") and not name.startswith("."):
self._pending_shared_store = False

# Handle polymorphic variables with CUDA_DEBUG_POLY enabled
if config.CUDA_DEBUG_POLY:
src_name = name.split(".")[0]
Expand Down Expand Up @@ -1806,6 +1828,13 @@ def pre_lower(self):
"""
super().pre_lower()

# Track address space for debug info
self._addrspace_map = {}
self._pending_shared_store = False
if self.context.enable_debuginfo:
self.debuginfo._set_addrspace_map(self._addrspace_map)

# Track polymorphic variables for debug info
self.poly_var_typ_map = {}
self.poly_var_loc_map = {}
self.poly_var_set = set()
Expand Down
89 changes: 89 additions & 0 deletions numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from numba.cuda.testing import skip_on_cudasim
from numba import cuda
from numba.cuda import types
from numba.cuda.np import numpy_support
from numba.cuda.testing import CUDATestCase
from numba.cuda.core import config
from textwrap import dedent
Expand Down Expand Up @@ -884,6 +885,94 @@ def foo():
""",
)

# shared_arr -> composite -> elements[4] (data field at index 4) -> pointer with dwarfAddressSpace: 8
# local_arr -> composite -> elements[4] (data field at index 4) -> pointer that must NOT carry a dwarfAddressSpace: 8 annotation
address_class_filechecks = r"""
CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}dwarfAddressSpace: 8{{.*}}tag: DW_TAG_pointer_type

CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace: 8
"""

def _test_shared_memory_address_class(self, dtype):
    """Test that shared memory arrays have correct DWARF address class.

    Shared-memory ``data`` pointers should carry ``dwarfAddressSpace: 8``
    in their debug metadata, while regular local arrays should not have
    this annotation.  Verified by running FileCheck over the generated
    LLVM IR with the patterns in ``address_class_filechecks``.
    """
    # debug=True/opt=False so full DI metadata is emitted.
    sig = (numpy_support.from_dtype(dtype),)

    @cuda.jit(sig, debug=True, opt=False)
    def kernel_with_shared(data):
        # The names "shared_arr" and "local_arr" are load-bearing: the
        # FileCheck patterns match DILocalVariable nodes by these names.
        shared_arr = cuda.shared.array(32, dtype=dtype)
        local_arr = cuda.local.array(32, dtype=dtype)
        idx = cuda.grid(1)
        if idx < 32:
            shared_arr[idx] = data + idx
            local_arr[idx] = data * 2 + idx
        cuda.syncthreads()
        # Read both arrays back so neither allocation is optimized away.
        if idx == 0:
            result = dtype(0)
            for i in range(32):
                result += shared_arr[i] + local_arr[i]

    llvm_ir = kernel_with_shared.inspect_llvm(sig)

    self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)

def test_shared_memory_address_class_int32(self):
    # 32-bit integer element type.
    self._test_shared_memory_address_class(np.int32)

def test_shared_memory_address_class_complex64(self):
    # Complex element type (lowered as a two-field struct).
    self._test_shared_memory_address_class(np.complex64)

def test_shared_memory_address_class_boolean(self):
    # Use np.bool_ rather than np.bool: the np.bool alias was removed in
    # NumPy 1.24 and only reintroduced in NumPy 2.0, so np.bool raises
    # AttributeError on 1.24-1.26.  np.bool_ works on both 1.x and 2.x.
    self._test_shared_memory_address_class(np.bool_)

def test_shared_memory_address_class_float16(self):
    # Half-precision float element type.
    self._test_shared_memory_address_class(np.float16)

def test_shared_memory_address_class_record(self):
    """Verify the DWARF address-class annotation for a structured
    (record) dtype, exercising composite element types in the debug
    metadata."""
    dtype = np.dtype(
        [
            ("a", np.int32),
            ("b", np.float32),
        ]
    )
    sig = (numpy_support.from_dtype(dtype),)

    @cuda.jit(sig, debug=True, opt=False)
    def kernel_with_shared(data):
        # The names "shared_arr" and "local_arr" are load-bearing: the
        # FileCheck patterns match DILocalVariable nodes by these names.
        shared_arr = cuda.shared.array(32, dtype=dtype)
        local_arr = cuda.local.array(32, dtype=dtype)
        result = cuda.local.array(1, dtype=dtype)
        idx = cuda.grid(1)
        if idx < 32:
            shared_arr[idx].a = data.a + idx
            local_arr[idx].a = data.a * 2 + idx
            shared_arr[idx].b = data.b + idx
            local_arr[idx].b = data.b * 2 + idx
        cuda.syncthreads()
        # Read both arrays back so neither allocation is optimized away.
        if idx == 0:
            result[0].a = 0
            result[0].b = 0.0
            for i in range(32):
                result[0].a += shared_arr[i].a + local_arr[i].a
                result[0].b += shared_arr[i].b + local_arr[i].b

    llvm_ir = kernel_with_shared.inspect_llvm(sig)

    self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)


if __name__ == "__main__":
    # Allow running this test module directly.
    unittest.main()
Loading