Skip to content
114 changes: 104 additions & 10 deletions numba_cuda/numba/cuda/debuginfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import abc
import os
from contextlib import contextmanager
from enum import IntEnum

import llvmlite
from llvmlite import ir
Expand Down Expand Up @@ -71,6 +72,16 @@ def _check_polymorphic_debug_info_support():
config.CUDA_DEBUG_POLY_USE_TYPED_CONST = DEBUG_POLY_USE_TYPED_CONST


class DwarfAddressClass(IntEnum):
    """DWARF address-class codes attached to pointer debug metadata.

    Values here are consumed by ``get_dwarf_address_class`` and emitted as
    the ``dwarfAddressSpace`` field of ``DIDerivedType`` pointer nodes.

    NOTE(review): these numeric values are presumed to follow the
    NVIDIA/cuda-gdb DWARF address-class convention (e.g. shared memory is
    0x08) — confirm against the CUDA debugger ABI documentation.
    """

    GENERIC = 0x00
    GLOBAL = 0x01
    REGISTER = 0x02
    CONSTANT = 0x05
    LOCAL = 0x06
    PARAMETER = 0x07
    SHARED = 0x08


@contextmanager
def suspend_emission(builder):
"""Suspends the emission of debug_metadata for the duration of the context
Expand Down Expand Up @@ -179,6 +190,19 @@ def initialize(self):
# constructing subprograms
self.dicompileunit = self._di_compile_unit()

def get_dwarf_address_class(self, addrspace):
    """Translate an NVVM address space into a DWARF address class.

    Returns the matching :class:`DwarfAddressClass` member, or ``None``
    for the generic address space and any unrecognized value.
    """
    # Imported at call time, as in the rest of this module — presumably
    # to avoid an import-time dependency on the driver layer.
    from numba.cuda.cudadrv import nvvm

    translation = {
        nvvm.ADDRSPACE_GLOBAL: DwarfAddressClass.GLOBAL,
        nvvm.ADDRSPACE_SHARED: DwarfAddressClass.SHARED,
        nvvm.ADDRSPACE_CONSTANT: DwarfAddressClass.CONSTANT,
        nvvm.ADDRSPACE_LOCAL: DwarfAddressClass.LOCAL,
    }
    # ADDRSPACE_GENERIC intentionally falls through to None, exactly like
    # any unknown address space.
    return translation.get(addrspace)

def _var_type(self, lltype, size, datamodel=None):
if self._DEBUG:
print(
Expand Down Expand Up @@ -622,6 +646,11 @@ def __init__(self, module, filepath, cgctx, directives_only):
super().__init__(module, filepath, cgctx, directives_only)
# Cache for local variable metadata type and line deduplication
self._vartypelinemap = {}
# Variable address space dictionary
self._var_addrspace_map = {}

def _set_addrspace_map(self, map):
    """Install the ``{variable name: NVVM address space}`` mapping that
    ``mark_variable`` later consults when emitting per-variable debug
    metadata.

    NOTE(review): the parameter shadows the builtin ``map``; harmless
    here, but worth renaming when the call sites can be updated together.
    """
    self._var_addrspace_map = map

def _var_type(self, lltype, size, datamodel=None):
is_bool = False
Expand Down Expand Up @@ -796,6 +825,65 @@ def _var_type(self, lltype, size, datamodel=None):
},
is_distinct=True,
)

# Check if there's actually address space info to handle
addrspace = getattr(self, "_addrspace", None)
if (
isinstance(lltype, ir.LiteralStructType)
and datamodel is not None
and datamodel.inner_models()
and addrspace not in (None, 0)
):
# Process struct with datamodel that has address space info
meta = []
offset = 0
for element, field, model in zip(
lltype.elements, datamodel._fields, datamodel.inner_models()
):
size_field = self.cgctx.get_abi_sizeof(element)
if isinstance(element, ir.PointerType) and field == "data":
# Create pointer type with correct address space
pointee_size = self.cgctx.get_abi_sizeof(element.pointee)
pointee_model = getattr(model, "_pointee_model", None)
pointee_type = self._var_type(
element.pointee, pointee_size, datamodel=pointee_model
)
meta_ptr = {
"tag": ir.DIToken("DW_TAG_pointer_type"),
"baseType": pointee_type,
"size": _BYTE_SIZE * size_field,
}
dwarf_addrclass = self.get_dwarf_address_class(addrspace)
if dwarf_addrclass is not None:
meta_ptr["dwarfAddressSpace"] = int(dwarf_addrclass)
basetype = m.add_debug_info("DIDerivedType", meta_ptr)
else:
basetype = self._var_type(
element, size_field, datamodel=model
)
derived_type = m.add_debug_info(
"DIDerivedType",
{
"tag": ir.DIToken("DW_TAG_member"),
"name": field,
"baseType": basetype,
"size": _BYTE_SIZE * size_field,
"offset": offset,
},
)
meta.append(derived_type)
offset += _BYTE_SIZE * size_field

return m.add_debug_info(
"DICompositeType",
{
"tag": ir.DIToken("DW_TAG_structure_type"),
"name": f"{datamodel.fe_type}",
"elements": m.add_metadata(meta),
"size": offset,
},
is_distinct=True,
)
# For other cases, use upstream Numba implementation
return super()._var_type(lltype, size, datamodel=datamodel)

Expand Down Expand Up @@ -848,16 +936,22 @@ def mark_variable(
# to llvm.dbg.value
return
else:
return super().mark_variable(
builder,
allocavalue,
name,
lltype,
size,
line,
datamodel,
argidx,
)
# Look up address space for this variable
self._addrspace = self._var_addrspace_map.get(name)
try:
return super().mark_variable(
builder,
allocavalue,
name,
lltype,
size,
line,
datamodel,
argidx,
)
finally:
# Clean up address space info
self._addrspace = None

def update_variable(
self,
Expand Down
29 changes: 29 additions & 0 deletions numba_cuda/numba/cuda/lowering.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from numba.cuda import HAS_NUMBA
from numba.cuda.core import ir
from numba.cuda import debuginfo, cgutils, utils, typing, types
from numba import cuda
from numba.cuda.core import (
ir_utils,
targetconfig,
Expand Down Expand Up @@ -1677,10 +1678,31 @@ def decref(self, typ, val):


class CUDALower(Lower):
def _is_shared_array_call(self, fnty):
    """Return True when *fnty* is the typing of ``cuda.shared.array``."""
    # Function types without a typing_key can never match; getattr's
    # None default makes the identity test fail for them.
    return getattr(fnty, "typing_key", None) is cuda.shared.array

def _lower_call_normal(self, fnty, expr, signature):
    """Lower an ordinary call expression.

    When debug info is enabled and the callee is ``cuda.shared.array``,
    set a marker so the subsequent ``storevar`` can record the stored
    variable as living in the shared address space.
    """
    wants_tracking = self.context.enable_debuginfo
    if wants_tracking and self._is_shared_array_call(fnty):
        # Consumed by storevar() once a user-visible name is stored.
        self._pending_shared_store = True
    return super()._lower_call_normal(fnty, expr, signature)

def storevar(self, value, name, argidx=None):
"""
Store the value into the given variable.
"""
# Track address space for debug info
if self.context.enable_debuginfo and self._pending_shared_store:
from numba.cuda.cudadrv import nvvm

self._addrspace_map[name] = nvvm.ADDRSPACE_SHARED
if not name.startswith("$") and not name.startswith("."):
self._pending_shared_store = False

# Handle polymorphic variables with CUDA_DEBUG_POLY enabled
if config.CUDA_DEBUG_POLY:
src_name = name.split(".")[0]
Expand Down Expand Up @@ -1806,6 +1828,13 @@ def pre_lower(self):
"""
super().pre_lower()

# Track address space for debug info
self._addrspace_map = {}
self._pending_shared_store = False
if self.context.enable_debuginfo:
self.debuginfo._set_addrspace_map(self._addrspace_map)

# Track polymorphic variables for debug info
self.poly_var_typ_map = {}
self.poly_var_loc_map = {}
self.poly_var_set = set()
Expand Down
89 changes: 89 additions & 0 deletions numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from numba.cuda.testing import skip_on_cudasim
from numba import cuda
from numba.cuda import types
from numba.cuda.np import numpy_support
from numba.cuda.testing import CUDATestCase
from numba.cuda.core import config
from textwrap import dedent
Expand Down Expand Up @@ -884,6 +885,94 @@ def foo():
""",
)

# shared_arr -> composite -> elements[4] (data field at index 4) -> pointer with dwarfAddressSpace: 8
# local_arr -> composite -> elements[4] (data field at index 4) -> pointer that must NOT carry a dwarfAddressSpace: 8 annotation
address_class_filechecks = r"""
CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}dwarfAddressSpace: 8{{.*}}tag: DW_TAG_pointer_type

CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace: 8
"""

def _test_shared_memory_address_class(self, dtype):
    """Test that shared memory arrays have correct DWARF address class.

    Shared-memory ``data`` pointers should carry ``dwarfAddressSpace: 8``
    in their debug metadata, while regular local arrays should not have
    this annotation.  Verified by running FileCheck over the generated
    LLVM IR with the patterns in ``address_class_filechecks``.
    """
    # debug=True/opt=False so full DI metadata is emitted.
    sig = (numpy_support.from_dtype(dtype),)

    @cuda.jit(sig, debug=True, opt=False)
    def kernel_with_shared(data):
        # The names "shared_arr" and "local_arr" are load-bearing: the
        # FileCheck patterns match DILocalVariable nodes by these names.
        shared_arr = cuda.shared.array(32, dtype=dtype)
        local_arr = cuda.local.array(32, dtype=dtype)
        idx = cuda.grid(1)
        if idx < 32:
            shared_arr[idx] = data + idx
            local_arr[idx] = data * 2 + idx
        cuda.syncthreads()
        # Read both arrays back so neither allocation is optimized away.
        if idx == 0:
            result = dtype(0)
            for i in range(32):
                result += shared_arr[i] + local_arr[i]

    llvm_ir = kernel_with_shared.inspect_llvm(sig)

    self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)

def test_shared_memory_address_class_int32(self):
    # 32-bit integer element type.
    self._test_shared_memory_address_class(np.int32)

def test_shared_memory_address_class_complex64(self):
    # Complex element type (lowered as a two-field struct).
    self._test_shared_memory_address_class(np.complex64)

def test_shared_memory_address_class_boolean(self):
    # Use np.bool_ rather than np.bool: the np.bool alias was removed in
    # NumPy 1.24 and only reintroduced in NumPy 2.0, so np.bool raises
    # AttributeError on 1.24-1.26.  np.bool_ works on both 1.x and 2.x.
    self._test_shared_memory_address_class(np.bool_)

def test_shared_memory_address_class_float16(self):
    # Half-precision float element type.
    self._test_shared_memory_address_class(np.float16)

def test_shared_memory_address_class_record(self):
    """Verify the DWARF address-class annotation for a structured
    (record) dtype, exercising composite element types in the debug
    metadata."""
    dtype = np.dtype(
        [
            ("a", np.int32),
            ("b", np.float32),
        ]
    )
    sig = (numpy_support.from_dtype(dtype),)

    @cuda.jit(sig, debug=True, opt=False)
    def kernel_with_shared(data):
        # The names "shared_arr" and "local_arr" are load-bearing: the
        # FileCheck patterns match DILocalVariable nodes by these names.
        shared_arr = cuda.shared.array(32, dtype=dtype)
        local_arr = cuda.local.array(32, dtype=dtype)
        result = cuda.local.array(1, dtype=dtype)
        idx = cuda.grid(1)
        if idx < 32:
            shared_arr[idx].a = data.a + idx
            local_arr[idx].a = data.a * 2 + idx
            shared_arr[idx].b = data.b + idx
            local_arr[idx].b = data.b * 2 + idx
        cuda.syncthreads()
        # Read both arrays back so neither allocation is optimized away.
        if idx == 0:
            result[0].a = 0
            result[0].b = 0.0
            for i in range(32):
                result[0].a += shared_arr[i].a + local_arr[i].a
                result[0].b += shared_arr[i].b + local_arr[i].b

    llvm_ir = kernel_with_shared.inspect_llvm(sig)

    self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)


if __name__ == "__main__":
    # Allow running this test module directly.
    unittest.main()
Loading