testing/python/carver/test_tilelang_carver_cuda_driver_properties.py (new file)
@@ -0,0 +1,76 @@
import tilelang.testing
from tilelang.carver.arch.driver.cuda_driver import (
    get_cuda_device_properties,
    get_device_name,
    get_shared_memory_per_block,
    get_device_attribute,
    get_max_dynamic_shared_size_bytes,
    get_persisting_l2_cache_max_size,
    get_num_sms,
    get_registers_per_block,
)
import torch


class _cudaDeviceAttrNames:
r"""
This struct carries all properties that are of int32_t.
refer to https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g49e2f8c2c0bd6fe264f2fc970912e5cd
"""

cudaDevAttrMaxThreadsPerBlock: int = 1
cudaDevAttrMaxSharedMemoryPerBlock: int = 8
cudaDevAttrMultiProcessorCount: int = 16
cudaDevAttrMaxSharedMemoryPerMultiprocessor: int = 81
cudaDevAttrMaxPersistingL2CacheSize: int = 108


def test_driver_get_device_properties():
    prop = get_cuda_device_properties()
    assert prop is not None, "Failed to get CUDA device properties"
    assert isinstance(
        prop,
        torch.cuda._CudaDeviceProperties), ("Returned object is not of type _CudaDeviceProperties")


def test_device_get_device_name():
    tl_device_name = get_device_name()
    th_device_name = torch.cuda.get_device_name()
    assert tl_device_name == th_device_name, "Device names do not match"


def test_device_get_shared_memory_per_block():
    tl_smem = get_shared_memory_per_block()
    driver_smem = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMaxSharedMemoryPerBlock)
    assert tl_smem == driver_smem, "Shared memory per block values do not match"


def test_device_get_persisting_l2_cache_size():
    tl_cache_size = get_persisting_l2_cache_max_size()
    driver_cache_size = get_device_attribute(
        _cudaDeviceAttrNames.cudaDevAttrMaxPersistingL2CacheSize)
    assert tl_cache_size == driver_cache_size, "Persisting L2 cache size values do not match"


def test_device_get_num_sms():
    tl_num_sms = get_num_sms()
    driver_num_sms = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMultiProcessorCount)
    assert tl_num_sms == driver_num_sms, "Number of SMs do not match"


def test_device_get_registers_per_block():
    tl_regs_per_block = get_registers_per_block()
    driver_regs_per_block = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMaxThreadsPerBlock)
    assert tl_regs_per_block == driver_regs_per_block, "Registers per block values do not match"
Comment on lines +61 to +64

⚠️ Potential issue | 🔴 Critical

Critical logic error: Wrong CUDA attribute for registers per block.

Line 63 uses cudaDevAttrMaxThreadsPerBlock to verify get_registers_per_block(), but max threads per block and registers per block are different CUDA device properties. The correct attribute is cudaDevAttrMaxRegistersPerBlock (enum value 12); the suggestion below adds it to the test's attribute class as the local constant cudaDevAttrRegsPerBlock.

The test will still pass, but it verifies the wrong property and can mask real bugs.

 def test_device_get_registers_per_block():
     tl_regs_per_block = get_registers_per_block()
-    driver_regs_per_block = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMaxThreadsPerBlock)
+    driver_regs_per_block = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrRegsPerBlock)
     assert tl_regs_per_block == driver_regs_per_block, "Registers per block values do not match"

Also add the constant to the class:

 class _cudaDeviceAttrNames:
     r"""
     This struct carries all properties that are of int32_t.
     refer to https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g49e2f8c2c0bd6fe264f2fc970912e5cd
     """

     cudaDevAttrMaxThreadsPerBlock: int = 1
     cudaDevAttrMaxSharedMemoryPerBlock: int = 8
     cudaDevAttrMultiProcessorCount: int = 16
+    cudaDevAttrRegsPerBlock: int = 12
     cudaDevAttrMaxSharedMemoryPerMultiprocessor: int = 81
     cudaDevAttrMaxPersistingL2CacheSize: int = 108
🤖 Prompt for AI Agents
In testing/python/carver/test_tilelang_carver_cuda_driver_properties.py around
lines 61 to 64, the test incorrectly compares get_registers_per_block() against
cudaDevAttrMaxThreadsPerBlock; replace the driver attribute with the correct
CUDA attribute for registers per block (cudaDevAttrRegsPerBlock or
cudaDevAttrMaxRegistersPerBlock depending on your enum naming) so the test
asserts the actual registers-per-block property, and add the corresponding
constant name/value to the _cudaDeviceAttrNames class/enum used by
get_device_attribute so the test can reference it.
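A quick way to see that these two attributes are not interchangeable is to query both enum values directly. The sketch below is illustrative only (it reuses the ctypes pattern from cuda_driver.py and assumes libcudart can be loaded by name) and is not part of this PR; on recent NVIDIA GPUs the two numbers differ by a wide margin (typically 1024 threads vs 65536 registers).

import ctypes
import sys

# Standalone sanity check (not part of this PR). Enum values per cudaDeviceAttr:
#   1  -> cudaDevAttrMaxThreadsPerBlock
#   12 -> cudaDevAttrMaxRegistersPerBlock
libname = "cudart64_110.dll" if sys.platform == "win32" else "libcudart.so"
libcudart = ctypes.CDLL(libname)


def query_attr(attr: int, device_id: int = 0) -> int:
    value = ctypes.c_int()
    ret = libcudart.cudaDeviceGetAttribute(ctypes.byref(value), attr, device_id)
    if ret != 0:
        raise RuntimeError(f"cudaDeviceGetAttribute failed with error {ret}")
    return value.value


print(query_attr(1), query_attr(12))  # e.g. 1024 vs 65536 on recent NVIDIA GPUs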



def test_device_get_max_dynamic_shared_size_bytes():
    tl_dynamic_smem = get_max_dynamic_shared_size_bytes()
    driver_dynamic_smem = get_device_attribute(
        _cudaDeviceAttrNames.cudaDevAttrMaxSharedMemoryPerMultiprocessor)
    assert tl_dynamic_smem == driver_dynamic_smem, (
        "Max dynamic shared size bytes values do not match")


if __name__ == "__main__":
    tilelang.testing.main()
tilelang/carver/arch/driver/cuda_driver.py (191 changes: 59 additions & 132 deletions)
@@ -2,123 +2,51 @@
import ctypes
import sys

try:
    import torch.cuda._CudaDeviceProperties as _CudaDeviceProperties
except ImportError:
    _CudaDeviceProperties = type("DummyCudaDeviceProperties", (), {})

class cudaDeviceProp(ctypes.Structure):
_fields_ = [
("name", ctypes.c_char * 256),
("uuid", ctypes.c_byte * 16), # cudaUUID_t
("luid", ctypes.c_char * 8),
("luidDeviceNodeMask", ctypes.c_uint),
("totalGlobalMem", ctypes.c_size_t),
("sharedMemPerBlock", ctypes.c_size_t),
("regsPerBlock", ctypes.c_int),
("warpSize", ctypes.c_int),
("memPitch", ctypes.c_size_t),
("maxThreadsPerBlock", ctypes.c_int),
("maxThreadsDim", ctypes.c_int * 3),
("maxGridSize", ctypes.c_int * 3),
("clockRate", ctypes.c_int),
("totalConstMem", ctypes.c_size_t),
("major", ctypes.c_int),
("minor", ctypes.c_int),
("textureAlignment", ctypes.c_size_t),
("texturePitchAlignment", ctypes.c_size_t),
("deviceOverlap", ctypes.c_int),
("multiProcessorCount", ctypes.c_int),
("kernelExecTimeoutEnabled", ctypes.c_int),
("integrated", ctypes.c_int),
("canMapHostMemory", ctypes.c_int),
("computeMode", ctypes.c_int),
("maxTexture1D", ctypes.c_int),
("maxTexture1DMipmap", ctypes.c_int),
("maxTexture1DLinear", ctypes.c_int),
("maxTexture2D", ctypes.c_int * 2),
("maxTexture2DMipmap", ctypes.c_int * 2),
("maxTexture2DLinear", ctypes.c_int * 3),
("maxTexture2DGather", ctypes.c_int * 2),
("maxTexture3D", ctypes.c_int * 3),
("maxTexture3DAlt", ctypes.c_int * 3),
("maxTextureCubemap", ctypes.c_int),
("maxTexture1DLayered", ctypes.c_int * 2),
("maxTexture2DLayered", ctypes.c_int * 3),
("maxTextureCubemapLayered", ctypes.c_int * 2),
("maxSurface1D", ctypes.c_int),
("maxSurface2D", ctypes.c_int * 2),
("maxSurface3D", ctypes.c_int * 3),
("maxSurface1DLayered", ctypes.c_int * 2),
("maxSurface2DLayered", ctypes.c_int * 3),
("maxSurfaceCubemap", ctypes.c_int),
("maxSurfaceCubemapLayered", ctypes.c_int * 2),
("surfaceAlignment", ctypes.c_size_t),
("concurrentKernels", ctypes.c_int),
("ECCEnabled", ctypes.c_int),
("pciBusID", ctypes.c_int),
("pciDeviceID", ctypes.c_int),
("pciDomainID", ctypes.c_int),
("tccDriver", ctypes.c_int),
("asyncEngineCount", ctypes.c_int),
("unifiedAddressing", ctypes.c_int),
("memoryClockRate", ctypes.c_int),
("memoryBusWidth", ctypes.c_int),
("l2CacheSize", ctypes.c_int),
("persistingL2CacheMaxSize", ctypes.c_int),
("maxThreadsPerMultiProcessor", ctypes.c_int),
("streamPrioritiesSupported", ctypes.c_int),
("globalL1CacheSupported", ctypes.c_int),
("localL1CacheSupported", ctypes.c_int),
("sharedMemPerMultiprocessor", ctypes.c_size_t),
("regsPerMultiprocessor", ctypes.c_int),
("managedMemory", ctypes.c_int),
("isMultiGpuBoard", ctypes.c_int),
("multiGpuBoardGroupID", ctypes.c_int),
("reserved2", ctypes.c_int * 2),
("reserved1", ctypes.c_int * 1),
("reserved", ctypes.c_int * 60)
]


def get_cuda_device_properties(device_id: int = 0) -> cudaDeviceProp | None:

    if sys.platform == "win32":
        libcudart = ctypes.windll.LoadLibrary("cudart64_110.dll")
    else:
        libcudart = ctypes.cdll.LoadLibrary("libcudart.so")

    prop = cudaDeviceProp()
    cudaGetDeviceProperties = libcudart.cudaGetDeviceProperties
    cudaGetDeviceProperties.argtypes = [ctypes.POINTER(cudaDeviceProp), ctypes.c_int]
    cudaGetDeviceProperties.restype = ctypes.c_int
    ret = cudaGetDeviceProperties(ctypes.byref(prop), device_id)
    if ret == 0:
        return prop
    else:
        raise RuntimeError(f"cudaGetDeviceProperties failed with error {ret}")

class cudaDeviceAttrNames:
r"""
refer to https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g49e2f8c2c0bd6fe264f2fc970912e5cd
"""

cudaDevAttrMaxThreadsPerBlock: int = 1
cudaDevAttrMaxSharedMemoryPerMultiprocessor: int = 81
cudaDevAttrMaxPersistingL2CacheSize: int = 108


def get_cuda_device_properties(device_id: int = 0) -> _CudaDeviceProperties | None:
    try:
        import torch.cuda

        if not torch.cuda.is_available():
            return None
        return torch.cuda.get_device_properties(torch.device(device_id))
    except ImportError:
        return None


def get_device_name(device_id: int = 0) -> str | None:
    prop = get_cuda_device_properties(device_id)
    if prop:
        return prop.name.decode()
    else:
        raise RuntimeError("Failed to get device properties.")
    return prop.name


def get_shared_memory_per_block(device_id: int = 0, format: str = "bytes") -> int | None:
assert format in ["bytes", "kb", "mb"], "Invalid format. Must be one of: bytes, kb, mb"
prop = get_cuda_device_properties(device_id)
if prop:
# Convert size_t to int to avoid overflow issues
shared_mem = int(prop.sharedMemPerBlock)
if format == "bytes":
return shared_mem
elif format == "kb":
return shared_mem // 1024
elif format == "mb":
return shared_mem // (1024 * 1024)
else:
raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")
shared_mem = int(prop.shared_memory_per_block)
if format == "bytes":
return shared_mem
elif format == "kb":
return shared_mem // 1024
elif format == "mb":
return shared_mem // (1024 * 1024)
else:
raise RuntimeError("Failed to get device properties.")
raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")
Comment on lines 38 to +49

⚠️ Potential issue | 🔴 Critical

Critical: Missing None check for device properties.

Line 41 accesses prop.shared_memory_per_block without checking if prop is None. When CUDA is unavailable or get_cuda_device_properties returns None, this will raise an AttributeError.

Compare with get_num_sms (lines 114-115), which correctly checks for None.

 def get_shared_memory_per_block(device_id: int = 0, format: str = "bytes") -> int | None:
     assert format in ["bytes", "kb", "mb"], "Invalid format. Must be one of: bytes, kb, mb"
     prop = get_cuda_device_properties(device_id)
+    if prop is None:
+        return None
     shared_mem = int(prop.shared_memory_per_block)
     if format == "bytes":
         return shared_mem
🧰 Tools
🪛 Ruff (0.14.7)

49-49: Avoid specifying long messages outside the exception class

(TRY003)
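
As a side note on the Ruff TRY003 hint above: one common remedy is to move the long message into a dedicated exception class so call sites stay short. A minimal sketch, with an illustrative class name that this PR does not define:

class InvalidFormatError(ValueError):
    """Raised when a size format other than 'bytes', 'kb' or 'mb' is requested."""

    def __init__(self, format: str) -> None:
        super().__init__(f"Invalid format {format!r}. Must be one of: bytes, kb, mb")


# Call sites then shrink to:
#     raise InvalidFormatError(format)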

🤖 Prompt for AI Agents
In tilelang/carver/arch/driver/cuda_driver.py around lines 38 to 49, the code
dereferences prop.shared_memory_per_block without checking if prop is None (CUDA
unavailable), causing an AttributeError; update the function to mirror
get_num_sms by first checking if prop is None and returning None immediately,
otherwise compute shared_mem and convert based on format; keep the existing
format assertion and final RuntimeError path unchanged.



def get_device_attribute(attr: int, device_id: int = 0) -> int:
@@ -130,7 +58,11 @@ def get_device_attribute(attr: int, device_id: int = 0) -> int:

    value = ctypes.c_int()
    cudaDeviceGetAttribute = libcudart.cudaDeviceGetAttribute
    cudaDeviceGetAttribute.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int]
    cudaDeviceGetAttribute.argtypes = [
        ctypes.POINTER(ctypes.c_int),
        ctypes.c_int,
        ctypes.c_int,
    ]
    cudaDeviceGetAttribute.restype = ctypes.c_int

    ret = cudaDeviceGetAttribute(ctypes.byref(value), attr, device_id)
@@ -148,28 +80,21 @@ def get_max_dynamic_shared_size_bytes(device_id: int = 0, format: str = "bytes")
Get the maximum dynamic shared memory size in bytes, kilobytes, or megabytes.
"""
assert format in ["bytes", "kb", "mb"], "Invalid format. Must be one of: bytes, kb, mb"
prop = get_cuda_device_properties(device_id)
if prop:
# Convert size_t to int to avoid overflow issues
shared_mem = int(prop.sharedMemPerMultiprocessor)
if format == "bytes":
return shared_mem
elif format == "kb":
return shared_mem // 1024
elif format == "mb":
return shared_mem // (1024 * 1024)
else:
raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")
shared_mem = get_device_attribute(
cudaDeviceAttrNames.cudaDevAttrMaxSharedMemoryPerMultiprocessor, device_id)
if format == "bytes":
return shared_mem
elif format == "kb":
return shared_mem // 1024
elif format == "mb":
return shared_mem // (1024 * 1024)
else:
raise RuntimeError("Failed to get device properties.")
raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")


def get_persisting_l2_cache_max_size(device_id: int = 0) -> int:
    prop = get_cuda_device_properties(device_id)
    if prop:
        return prop.persistingL2CacheMaxSize
    else:
        raise RuntimeError("Failed to get device properties for persisting L2 cache max size.")
    prop = get_device_attribute(cudaDeviceAttrNames.cudaDevAttrMaxPersistingL2CacheSize, device_id)
    return prop
Comment on lines 95 to +97

⚠️ Potential issue | 🟠 Major

Major: Type inconsistency and missing None handling.

The function declares return type int, but get_device_attribute can return None (line 75). This creates a type mismatch and potential runtime issues for callers expecting an int.

Either update the return type to reflect the possible None:

-def get_persisting_l2_cache_max_size(device_id: int = 0) -> int:
+def get_persisting_l2_cache_max_size(device_id: int = 0) -> int | None:
     prop = get_device_attribute(cudaDeviceAttrNames.cudaDevAttrMaxPersistingL2CacheSize, device_id)
     return prop

Or keep the int return type and add explicit error handling:

 def get_persisting_l2_cache_max_size(device_id: int = 0) -> int:
     prop = get_device_attribute(cudaDeviceAttrNames.cudaDevAttrMaxPersistingL2CacheSize, device_id)
+    if prop is None:
+        raise RuntimeError("Failed to get persisting L2 cache size.")
     return prop
🤖 Prompt for AI Agents
tilelang/carver/arch/driver/cuda_driver.py lines 95-97: get_device_attribute can
return None but the function is annotated to return int, so either update the
signature to return Optional[int] and return prop as-is, or keep int and add
explicit None handling (raise a clear exception like ValueError or return a
documented default) before returning; choose one approach, update the type hint
and callers/docs accordingly, and ensure no path returns None while annotated as
int.
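
To make the caller-side risk concrete, here is a short hypothetical usage (not taken from this PR): a type checker that trusts the declared int will accept the arithmetic below, yet it fails at runtime whenever the attribute query returns None.

# Hypothetical caller relying on the declared `-> int`
size = get_persisting_l2_cache_max_size()
reserve = size // 2  # TypeError at runtime if the attribute query returned None

# Defensive variant until the signature (or the None handling) is fixed
size = get_persisting_l2_cache_max_size()
if size is None:
    raise RuntimeError("Failed to get persisting L2 cache size.")
reserve = size // 2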



def get_num_sms(device_id: int = 0) -> int:
@@ -186,15 +111,17 @@ def get_num_sms(device_id: int = 0) -> int:
        RuntimeError: If unable to get the device properties.
    """
    prop = get_cuda_device_properties(device_id)
    if prop:
        return prop.multiProcessorCount
    else:
    if prop is None:
        raise RuntimeError("Failed to get device properties.")
    return prop.multi_processor_count


def get_registers_per_block(device_id: int = 0) -> int:
    prop = get_cuda_device_properties(device_id)
    if prop:
        return prop.regsPerBlock
    else:
        raise RuntimeError("Failed to get device properties.")
    """
    Get the maximum number of 32-bit registers available per block.
    """
    prop = get_device_attribute(
        cudaDeviceAttrNames.cudaDevAttrMaxThreadsPerBlock,
        device_id,
    )
    return prop
Comment on lines 119 to +127

⚠️ Potential issue | 🔴 Critical

Critical: Wrong CUDA attribute - returns max threads instead of registers.

The function name is get_registers_per_block but it queries cudaDevAttrMaxThreadsPerBlock. These are distinct CUDA device properties:

  • Max threads per block: Maximum number of threads that can be in a block
  • Registers per block: Maximum number of 32-bit registers available per block

The correct attribute is cudaDevAttrMaxRegistersPerBlock (enum value 12); the suggestion below names the local constant cudaDevAttrRegsPerBlock.

This critical bug exists in both the driver and test file, which is why the tests pass despite returning the wrong value.

 def get_registers_per_block(device_id: int = 0) -> int:
     """
     Get the maximum number of 32-bit registers available per block.
     """
     prop = get_device_attribute(
-        cudaDeviceAttrNames.cudaDevAttrMaxThreadsPerBlock,
+        cudaDeviceAttrNames.cudaDevAttrRegsPerBlock,
         device_id,
     )
+    if prop is None:
+        raise RuntimeError("Failed to get registers per block.")
     return prop

Also add the constant to the class:

 class cudaDeviceAttrNames:
     r"""
     refer to https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g49e2f8c2c0bd6fe264f2fc970912e5cd
     """

     cudaDevAttrMaxThreadsPerBlock: int = 1
+    cudaDevAttrRegsPerBlock: int = 12
     cudaDevAttrMaxSharedMemoryPerMultiprocessor: int = 81
     cudaDevAttrMaxPersistingL2CacheSize: int = 108

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In tilelang/carver/arch/driver/cuda_driver.py around lines 119 to 127, the
function get_registers_per_block incorrectly queries
cudaDevAttrMaxThreadsPerBlock; change it to query cudaDevAttrRegsPerBlock (CUDA
attribute value 12) so it returns the number of 32-bit registers per block
rather than max threads, and update any related test to assert the correct
attribute; additionally add the constant cudaDevAttrRegsPerBlock to the device
attribute names class/enum where other cudaDevAttr* constants are defined so the
name is available for use.
