From f79a7f04297854ea2ab67b6da2d32c6a6d4b12e6 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 20 May 2025 11:31:41 -0500
Subject: [PATCH 01/16] Fix typo: stream._handle -> stream.handle

The Stream class does not have a _handle data member.
---
 cuda_core/cuda/core/experimental/_launcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index 7eef9de13d..4451b4571b 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -146,6 +146,6 @@ def launch(stream, config, kernel, *kernel_args):
     # TODO: check if config has any unsupported attrs
     handle_return(
         driver.cuLaunchKernel(
-            int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream._handle, args_ptr, 0
+            int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
         )
     )

From de7b3c90b091cb92a57df5c5fd5b82a41d7dc6a0 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 20 May 2025 11:41:47 -0500
Subject: [PATCH 02/16] Move definition of LaunchConfig class to separate file

This is necessary to avoid a circular dependency: the cluster-related
occupancy functions need LaunchConfig, but occupancy functions are
defined in _module.py, and _launcher.py, which used to house the
definition of LaunchConfig, imports Kernel from _module.py.
---
 cuda_core/cuda/core/experimental/__init__.py  |  3 +-
 .../cuda/core/experimental/_launch_config.py  | 97 +++++++++++++++++++
 cuda_core/cuda/core/experimental/_launcher.py | 69 +------------
 3 files changed, 102 insertions(+), 67 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_launch_config.py

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 9093163b39..7739ae64e6 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -5,7 +5,8 @@
 from cuda.core.experimental import utils
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._launcher import LaunchConfig, launch
+from cuda.core.experimental._launch_config import LaunchConfig
+from cuda.core.experimental._launcher import launch
 from cuda.core.experimental._linker import Linker, LinkerOptions
 from cuda.core.experimental._module import ObjectCode
 from cuda.core.experimental._program import Program, ProgramOptions
diff --git a/cuda_core/cuda/core/experimental/_launch_config.py b/cuda_core/cuda/core/experimental/_launch_config.py
new file mode 100644
index 0000000000..bb4e92fb37
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_launch_config.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+from cuda.core.experimental._device import Device
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    cast_to_3_tuple,
+    driver,
+    get_binding_version,
+    handle_return,
+)
+
+# TODO: revisit this treatment for py313t builds
+_inited = False
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _use_ex
+    # binding availability depends on cuda-python version
+    _py_major_minor = get_binding_version()
+    _driver_ver = handle_return(driver.cuDriverGetVersion())
+    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
+    _inited = True
+
+
+@dataclass
+class LaunchConfig:
+    """Customizable launch options.
+
+    Attributes
+    ----------
+    grid : Union[tuple, int]
+        Collection of threads that will execute a kernel function.
+    cluster : Union[tuple, int]
+        Group of blocks (Thread Block Cluster) that will execute on the same
+        GPU Processing Cluster (GPC). Blocks within a cluster have access to
+        distributed shared memory and can be explicitly synchronized.
+    block : Union[tuple, int]
+        Group of threads (Thread Block) that will execute on the same
+        streaming multiprocessor (SM). Threads within a thread block have
+        access to shared memory and can be explicitly synchronized.
+    shmem_size : int, optional
+        Dynamic shared-memory size per thread block in bytes.
+        (Defaults to size 0)
+
+    """
+
+    # TODO: expand LaunchConfig to include other attributes
+    grid: Union[tuple, int] = None
+    cluster: Union[tuple, int] = None
+    block: Union[tuple, int] = None
+    shmem_size: Optional[int] = None
+
+    def __post_init__(self):
+        _lazy_init()
+        self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
+        self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
+        # thread block clusters are supported starting H100
+        if self.cluster is not None:
+            if not _use_ex:
+                err, drvers = driver.cuDriverGetVersion()
+                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
+                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
+            cc = Device().compute_capability
+            if cc < (9, 0):
+                raise CUDAError(
+                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
+                )
+            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
+        if self.shmem_size is None:
+            self.shmem_size = 0
+
+
+def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
+    _lazy_init()
+    drv_cfg = driver.CUlaunchConfig()
+    drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
+    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
+    drv_cfg.sharedMemBytes = config.shmem_size
+    attrs = []  # TODO: support more attributes
+    if config.cluster:
+        attr = driver.CUlaunchAttribute()
+        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+        dim = attr.value.clusterDim
+        dim.x, dim.y, dim.z = config.cluster
+        attrs.append(attr)
+    drv_cfg.numAttrs = len(attrs)
+    drv_cfg.attrs = attrs
+    return drv_cfg
diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index 4451b4571b..72afb5ffd3 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -2,17 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from dataclasses import dataclass
-from typing import Optional, Union
-from cuda.core.experimental._device import Device 
from cuda.core.experimental._kernel_arg_handler import ParamHolder +from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( - CUDAError, - cast_to_3_tuple, check_or_create_options, driver, get_binding_version, @@ -37,54 +33,6 @@ def _lazy_init(): _inited = True -@dataclass -class LaunchConfig: - """Customizable launch options. - - Attributes - ---------- - grid : Union[tuple, int] - Collection of threads that will execute a kernel function. - cluster : Union[tuple, int] - Group of blocks (Thread Block Cluster) that will execute on the same - GPU Processing Cluster (GPC). Blocks within a cluster have access to - distributed shared memory and can be explicitly synchronized. - block : Union[tuple, int] - Group of threads (Thread Block) that will execute on the same - streaming multiprocessor (SM). Threads within a thread blocks have - access to shared memory and can be explicitly synchronized. - shmem_size : int, optional - Dynamic shared-memory size per thread block in bytes. - (Default to size 0) - - """ - - # TODO: expand LaunchConfig to include other attributes - grid: Union[tuple, int] = None - cluster: Union[tuple, int] = None - block: Union[tuple, int] = None - shmem_size: Optional[int] = None - - def __post_init__(self): - _lazy_init() - self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid) - self.block = cast_to_3_tuple("LaunchConfig.block", self.block) - # thread block clusters are supported starting H100 - if self.cluster is not None: - if not _use_ex: - err, drvers = driver.cuDriverGetVersion() - drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else "" - raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}") - cc = Device().compute_capability - if cc < (9, 0): - raise CUDAError( - f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})" - ) - self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster) - if self.shmem_size is None: - self.shmem_size = 0 - - def launch(stream, config, kernel, *kernel_args): """Launches a :obj:`~_module.Kernel` object with launch-time configuration. @@ -114,6 +62,7 @@ def launch(stream, config, kernel, *kernel_args): f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" ) from e assert_type(kernel, Kernel) + _lazy_init() config = check_or_create_options(LaunchConfig, config, "launch config") # TODO: can we ensure kernel_args is valid/safe to use here? @@ -127,20 +76,8 @@ def launch(stream, config, kernel, *kernel_args): # mainly to see if the "Ex" API is available and if so we use it, as it's more feature # rich. 
if _use_ex: - drv_cfg = driver.CUlaunchConfig() - drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid - drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block + drv_cfg = _to_native_launch_config(config) drv_cfg.hStream = stream.handle - drv_cfg.sharedMemBytes = config.shmem_size - attrs = [] # TODO: support more attributes - if config.cluster: - attr = driver.CUlaunchAttribute() - attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION - dim = attr.value.clusterDim - dim.x, dim.y, dim.z = config.cluster - attrs.append(attr) - drv_cfg.numAttrs = len(attrs) - drv_cfg.attrs = attrs handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) else: # TODO: check if config has any unsupported attrs From 5bd64a76a7a40e8b0746c851aff4223309743f53 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 20 May 2025 11:45:58 -0500 Subject: [PATCH 03/16] Introduce _module.KernelOccupancy class This class defines kernel occupancy query methods. - max_active_blocks_per_multiprocessor - max_potential_block_size - available_dynamic_shared_memory_per_block - max_potential_cluster_size - max_active_clusters Implementation is based on driver API. The following occupancy-related driver functions are not used - `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` - `cuOccupancyMaxPotentialBlockSizeWithFlags` In `cuOccupancyMaxPotentialBlockSize`, only constant dynamic shared-memory size is supported for now. Supporting variable dynamic shared-memory size that depends on the block size is deferred until design is resolved. --- cuda_core/cuda/core/experimental/_module.py | 65 ++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 021165c5cf..8ec03e1018 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -6,6 +6,8 @@ from typing import Optional, Union from warnings import warn +from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.clear_error_support import ( assert_type, assert_type_str_or_bytes, @@ -184,6 +186,59 @@ def cluster_scheduling_policy_preference(self, device_id: int = None) -> int: ) +class KernelOccupancy: + """ """ + + def __new__(self, *args, **kwargs): + raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.") + + slots = ("_handle",) + + @classmethod + def _init(cls, handle): + self = super().__new__(cls) + self._handle = handle + + return self + + def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int: + """int : Occupancy of the kernel""" + return handle_return( + driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(self._handle, block_size, dynamic_shared_memory_size) + ) + + # FIXME: better docstring needed + def max_potential_block_size(self, dynamic_shared_memory_size: int, block_size_limit: int) -> tuple[int]: + """(int, int): Suggested launch configuration for reasonable occupancy. + + Returns the minimum grid size needed to achieve the maximum occupancy and + the maximum block size that can achieve the maximum occupancy. 
+ """ + return handle_return( + driver.cuOccupancyMaxPotentialBlockSize(self._handle, None, dynamic_shared_memory_size, block_size_limit) + ) + + def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int: + """int: Dynamic shared memory available per block for given launch configuration.""" + return handle_return( + driver.cuOccupancyAvailableDynamicSMemPerBlock(self._handle, num_blocks_per_multiprocessor, block_size) + ) + + def max_potential_cluster_size(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int: + """ "int: The maximum cluster size that can be launched for this kernel and launch configuration""" + drv_cfg = _to_native_launch_config(config) + if stream is not None: + drv_cfg.hStream = stream._handle + return handle_return(driver.cuOccupancyMaxPotentialClusterSize(self._handle, drv_cfg)) + + def max_active_clusters(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int: + """ "int: The maximum number of clusters that could co-exist on the target device""" + drv_cfg = _to_native_launch_config(config) + if stream is not None: + drv_cfg.hStream = stream._handle + return handle_return(driver.cuOccupancyMaxActiveClusters(self._handle, drv_cfg)) + + ParamInfo = namedtuple("ParamInfo", ["offset", "size"]) @@ -198,7 +253,7 @@ class Kernel: """ - __slots__ = ("_handle", "_module", "_attributes") + __slots__ = ("_handle", "_module", "_attributes", "_occupancy") def __new__(self, *args, **kwargs): raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.") @@ -211,6 +266,7 @@ def _from_obj(cls, obj, mod): ker._handle = obj ker._module = mod ker._attributes = None + ker._occupancy = None return ker @property @@ -250,6 +306,13 @@ def arguments_info(self) -> list[ParamInfo]: _, param_info = self._get_arguments_info(param_info=True) return param_info + @property + def occupancy(self) -> KernelOccupancy: + """Get the read-only attributes of this kernel.""" + if self._occupancy is None: + self._occupancy = KernelOccupancy._init(self._handle) + return self._occupancy + # TODO: implement from_handle() From b89c95feebfa0ab8057352c7c4fb0991e8402932 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 20 May 2025 12:58:39 -0500 Subject: [PATCH 04/16] Add occupancy tests, except for cluster-related queries --- cuda_core/tests/test_module.py | 53 +++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index cc86209068..b0c785bc17 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -8,7 +8,7 @@ from conftest import skipif_testing_with_compute_sanitizer import cuda.core.experimental -from cuda.core.experimental import ObjectCode, Program, ProgramOptions, system +from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions, system from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return SAXPY_KERNEL = r""" @@ -40,6 +40,11 @@ def test_kernel_attributes_init_disabled(): cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked. +def test_kernel_occupancy_init_disabled(): + with pytest.raises(RuntimeError, match=r"^KernelOccupancy cannot be instantiated directly\."): + cuda.core.experimental._module.KernelOccupancy() # Ensure back door is locked. 
+
+
 def test_kernel_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."):
         cuda.core.experimental._module.Kernel()  # Ensure back door is locked.
@@ -245,3 +250,49 @@ def test_num_args_error_handling(deinit_all_contexts_function, cuda12_prerequisi
     with pytest.raises(CUDAError):
         # assignment resolves linter error "B018: useless expression"
         _ = krn.num_arguments
+
+
+@pytest.mark.parametrize("block_size", [32, 64, 96, 120, 128, 256])
+@pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096])
+def test_saxpy_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel, block_size, smem_size_per_block):
+    kernel, _ = get_saxpy_kernel
+    dev_props = Device().properties
+    assert block_size <= dev_props.max_threads_per_block
+    assert smem_size_per_block <= dev_props.max_shared_memory_per_block
+    num_blocks_per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(block_size, smem_size_per_block)
+    assert isinstance(num_blocks_per_sm, int)
+    assert num_blocks_per_sm > 0
+    kernel_threads_per_sm = num_blocks_per_sm * block_size
+    kernel_smem_size_per_sm = num_blocks_per_sm * smem_size_per_block
+    assert kernel_threads_per_sm <= dev_props.max_threads_per_multiprocessor
+    assert kernel_smem_size_per_sm <= dev_props.max_shared_memory_per_multiprocessor
+    assert kernel.attributes.num_regs() * num_blocks_per_sm <= dev_props.max_registers_per_multiprocessor
+
+
+@pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 256])
+@pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096])
+def test_saxpy_occupancy_max_potential_block_size(get_saxpy_kernel, block_size_limit, smem_size_per_block):
+    kernel, _ = get_saxpy_kernel
+    dev_props = Device().properties
+    assert block_size_limit <= dev_props.max_threads_per_block
+    assert smem_size_per_block <= dev_props.max_shared_memory_per_block
+    config_data = kernel.occupancy.max_potential_block_size(smem_size_per_block, block_size_limit)
+    assert isinstance(config_data, tuple)
+    assert len(config_data) == 2
+    min_grid_size, max_block_size = config_data
+    assert isinstance(min_grid_size, int)
+    assert isinstance(max_block_size, int)
+    assert min_grid_size > 0
+    assert max_block_size > 0
+    assert max_block_size <= block_size_limit
+
+
+@pytest.mark.parametrize("num_blocks_per_sm, block_size", [(4, 32), (2, 64), (2, 96), (3, 120), (2, 128), (1, 256)])
+def test_saxpy_occupancy_available_dynamic_shared_memory_per_block(get_saxpy_kernel, num_blocks_per_sm, block_size):
+    kernel, _ = get_saxpy_kernel
+    dev_props = Device().properties
+    assert block_size <= dev_props.max_threads_per_block
+    assert num_blocks_per_sm * block_size <= dev_props.max_threads_per_multiprocessor
+    smem_size = kernel.occupancy.available_dynamic_shared_memory_per_block(num_blocks_per_sm, block_size)
+    assert smem_size <= dev_props.max_shared_memory_per_block
+    assert num_blocks_per_sm * smem_size <= dev_props.max_shared_memory_per_multiprocessor

From 9679e0eb5a324bfa3bb76db5917d634daa9cb1e9 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Thu, 22 May 2025 15:32:20 -0500
Subject: [PATCH 05/16] Fix typo in querying handle from Stream argument

---
 cuda_core/cuda/core/experimental/_module.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 8ec03e1018..3df722e219 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ 
b/cuda_core/cuda/core/experimental/_module.py @@ -228,14 +228,14 @@ def max_potential_cluster_size(self, config: LaunchConfig, stream: Optional[Stre """ "int: The maximum cluster size that can be launched for this kernel and launch configuration""" drv_cfg = _to_native_launch_config(config) if stream is not None: - drv_cfg.hStream = stream._handle + drv_cfg.hStream = stream.handle return handle_return(driver.cuOccupancyMaxPotentialClusterSize(self._handle, drv_cfg)) def max_active_clusters(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int: """ "int: The maximum number of clusters that could co-exist on the target device""" drv_cfg = _to_native_launch_config(config) if stream is not None: - drv_cfg.hStream = stream._handle + drv_cfg.hStream = stream.handle return handle_return(driver.cuOccupancyMaxActiveClusters(self._handle, drv_cfg)) From ff322ecad154b5b65979b5840d07cd6a60557882 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 22 May 2025 15:33:03 -0500 Subject: [PATCH 06/16] Add tests for cluster-related occupancy descriptors --- cuda_core/tests/test_module.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index b0c785bc17..ef6087101b 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -296,3 +296,32 @@ def test_saxpy_occupancy_available_dynamic_shared_memory_per_block(get_saxpy_ker smem_size = kernel.occupancy.available_dynamic_shared_memory_per_block(num_blocks_per_sm, block_size) assert smem_size <= dev_props.max_shared_memory_per_block assert num_blocks_per_sm * smem_size <= dev_props.max_shared_memory_per_multiprocessor + + +@pytest.mark.parametrize("cluster", [None, 2]) +def test_saxpy_occupancy_max_active_clusters(get_saxpy_kernel, cluster): + kernel, _ = get_saxpy_kernel + dev = Device() + if (cluster) and (dev.compute_capability < (9, 0)): + pytest.skip("Device with compute capability 90 or higher is required for cluster support") + launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64, cluster=cluster) + query_fn = kernel.occupancy.max_active_clusters + max_active_clusters = query_fn(launch_config) + assert isinstance(max_active_clusters, int) + assert max_active_clusters >= 0 + max_active_clusters = query_fn(launch_config, stream=dev.default_stream) + assert isinstance(max_active_clusters, int) + assert max_active_clusters >= 0 + + +def test_saxpy_occupancy_max_potential_cluster_size(get_saxpy_kernel): + kernel, _ = get_saxpy_kernel + dev = Device() + launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64) + query_fn = kernel.occupancy.max_potential_cluster_size + max_potential_cluster_size = query_fn(launch_config) + assert isinstance(max_potential_cluster_size, int) + assert max_potential_cluster_size >= 0 + max_potential_cluster_size = query_fn(launch_config, stream=dev.default_stream) + assert isinstance(max_potential_cluster_size, int) + assert max_potential_cluster_size >= 0 From fd8302f349cfcd2413986e6b6f5634898db2eeff Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 22 May 2025 15:39:25 -0500 Subject: [PATCH 07/16] Introduce MaxPotentialBlockSizeOccupancyResult named tuple Use it as return type for the KernelOccupancy.max_potential_block_size output. 
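
For illustration (a sketch; it assumes `kernel` is a compiled kernel
object, e.g. obtained via ObjectCode.get_kernel), the result can be
unpacked positionally or accessed by field name:

    res = kernel.occupancy.max_potential_block_size(0, 0)
    min_grid_size, max_block_size = res          # tuple-style unpacking
    assert res.min_grid_size == min_grid_size    # named-field access
    assert res.max_block_size == max_block_size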
--- cuda_core/cuda/core/experimental/_module.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 3df722e219..35eb7ae836 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -186,6 +186,9 @@ def cluster_scheduling_policy_preference(self, device_id: int = None) -> int: ) +MaxPotentialBlockSizeOccupancyResult = namedtuple("MaxPotential", ("min_grid_size", "max_block_size")) + + class KernelOccupancy: """ """ @@ -207,16 +210,18 @@ def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_m driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(self._handle, block_size, dynamic_shared_memory_size) ) - # FIXME: better docstring needed - def max_potential_block_size(self, dynamic_shared_memory_size: int, block_size_limit: int) -> tuple[int]: - """(int, int): Suggested launch configuration for reasonable occupancy. + def max_potential_block_size( + self, dynamic_shared_memory_size: int, block_size_limit: int + ) -> MaxPotentialBlockSizeOccupancyResult: + """MaxPotentialBlockSizeOccupancyResult: Suggested launch configuration for reasonable occupancy. Returns the minimum grid size needed to achieve the maximum occupancy and the maximum block size that can achieve the maximum occupancy. """ - return handle_return( + min_grid_size, max_block_size = handle_return( driver.cuOccupancyMaxPotentialBlockSize(self._handle, None, dynamic_shared_memory_size, block_size_limit) ) + return MaxPotentialBlockSizeOccupancyResult(min_grid_size=min_grid_size, max_block_size=max_block_size) def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int: """int: Dynamic shared memory available per block for given launch configuration.""" From 40d799a77c197978ce396e31bbfa52e79f6e0934 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 22 May 2025 16:34:24 -0500 Subject: [PATCH 08/16] KernelOccupancy.max_potential_block_size support for CUoccupancyB2DSize cuda_utils.driver.CUoccupancyB2DSize type is supported. Required size of dynamic shared memory allocation renamed to dynamic_shared_memory_needed --- cuda_core/cuda/core/experimental/_module.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 35eb7ae836..858fc2aa8e 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -211,16 +211,25 @@ def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_m ) def max_potential_block_size( - self, dynamic_shared_memory_size: int, block_size_limit: int + self, dynamic_shared_memory_needed: Union[int, driver.CUoccupancyB2DSize], block_size_limit: int ) -> MaxPotentialBlockSizeOccupancyResult: """MaxPotentialBlockSizeOccupancyResult: Suggested launch configuration for reasonable occupancy. Returns the minimum grid size needed to achieve the maximum occupancy and the maximum block size that can achieve the maximum occupancy. 
""" - min_grid_size, max_block_size = handle_return( - driver.cuOccupancyMaxPotentialBlockSize(self._handle, None, dynamic_shared_memory_size, block_size_limit) - ) + if isinstance(dynamic_shared_memory_needed, int): + min_grid_size, max_block_size = handle_return( + driver.cuOccupancyMaxPotentialBlockSize( + self._handle, None, dynamic_shared_memory_needed, block_size_limit + ) + ) + else: + min_grid_size, max_block_size = handle_return( + driver.cuOccupancyMaxPotentialBlockSize( + self._handle, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit + ) + ) return MaxPotentialBlockSizeOccupancyResult(min_grid_size=min_grid_size, max_block_size=max_block_size) def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int: From 5968ff0be277fde73524c44275f3347488ebefe9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 22 May 2025 16:36:43 -0500 Subject: [PATCH 09/16] Add test for B2DSize usage in max_potential_block_size Test requires Numba. If numba is absent, it is skipped, otherwise `numba.cfunc` is used to compile Python function. ctypes.CFuncPtr object obtained from cfunc_res.ctypes is converted to CUoccupancyB2DSize. --- cuda_core/tests/test_module.py | 45 +++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index ef6087101b..06448d48c6 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -11,6 +11,11 @@ from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions, system from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return +try: + import numba +except ImportError: + numba = None + SAXPY_KERNEL = r""" template __global__ void saxpy(const T a, @@ -269,9 +274,10 @@ def test_saxpy_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel, b assert kernel.attributes.num_regs() * num_blocks_per_sm <= dev_props.max_registers_per_multiprocessor -@pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 256]) +@pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 256, 0]) @pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096]) -def test_saxpy_occupancy_max_potential_block_size(get_saxpy_kernel, block_size_limit, smem_size_per_block): +def test_saxpy_occupancy_max_potential_block_size_constant(get_saxpy_kernel, block_size_limit, smem_size_per_block): + """Tests use case when shared memory needed is independent on the block size""" kernel, _ = get_saxpy_kernel dev_props = Device().properties assert block_size_limit <= dev_props.max_threads_per_block @@ -284,7 +290,40 @@ def test_saxpy_occupancy_max_potential_block_size(get_saxpy_kernel, block_size_l assert isinstance(max_block_size, int) assert min_grid_size > 0 assert max_block_size > 0 - assert max_block_size <= block_size_limit + if block_size_limit > 0: + assert max_block_size <= block_size_limit + else: + assert max_block_size <= dev_props.max_threads_per_block + assert min_grid_size == config_data.min_grid_size + assert max_block_size == config_data.max_block_size + + +@pytest.mark.skipif(numba is None, reason="Test requires numba to be installed") +@pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 277, 0]) +def test_saxpy_occupancy_max_potential_block_size_b2dsize(get_saxpy_kernel, block_size_limit): + """Tests use case when shared memory needed depends on the 
block size"""
+    kernel, _ = get_saxpy_kernel
+
+    def shared_memory_needed(block_size: numba.intc) -> numba.size_t:
+        "Size of dynamic shared memory needed by kernel of this block size"
+        return 1024 * (block_size // 32)
+
+    b2dsize_sig = numba.size_t(numba.intc)
+    dsmem_needed_cfunc = numba.cfunc(b2dsize_sig)(shared_memory_needed)
+    fn_ptr = ctypes.cast(dsmem_needed_cfunc.ctypes, ctypes.c_void_p).value
+    b2dsize_fn = driver.CUoccupancyB2DSize(_ptr=fn_ptr)
+    config_data = kernel.occupancy.max_potential_block_size(b2dsize_fn, block_size_limit)
+    dev_props = Device().properties
+    assert block_size_limit <= dev_props.max_threads_per_block
+    min_grid_size, max_block_size = config_data
+    assert isinstance(min_grid_size, int)
+    assert isinstance(max_block_size, int)
+    assert min_grid_size > 0
+    assert max_block_size > 0
+    if block_size_limit > 0:
+        assert max_block_size <= block_size_limit
+    else:
+        assert max_block_size <= dev_props.max_threads_per_block

From 428f4faf0605e2e872cfc6e2243e3231002bab84 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Fri, 30 May 2025 10:06:25 -0500
Subject: [PATCH 10/16] Improved max_potential_block_size.__doc__

Expanded the docstring and added an advisory about the possibility of
deadlocks should the function encoded by CUoccupancyB2DSize require the
GIL.

Added argument type validation for the dynamic_shared_memory_needed
argument.
---
 cuda_core/cuda/core/experimental/_module.py | 31 ++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index fe054bb9c1..78157f0d20 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -217,6 +217,30 @@ def max_potential_block_size(
 
         Returns the minimum grid size needed to achieve the maximum occupancy and
         the maximum block size that can achieve the maximum occupancy.
+
+        Parameters
+        ----------
+        dynamic_shared_memory_needed: Union[int, driver.CUoccupancyB2DSize]
+            The amount of dynamic shared memory in bytes needed by a block.
+            Use `0` if the block does not need shared memory. Use a C-callable
+            represented by :obj:`~driver.CUoccupancyB2DSize` to encode the
+            amount of needed dynamic shared memory, which varies depending
+            on the block size.
+        block_size_limit: int
+            Known upper limit on the kernel block size. Use `0` to indicate
+            the maximum block size permitted by the device / kernel instead.
+
+        Returns
+        -------
+        :obj:`~MaxPotentialBlockSizeOccupancyResult`
+            An object with `min_grid_size` and `max_block_size` attributes encoding
+            the suggested launch configuration.
+
+        Note
+        ----
+        Please be advised that use of a C-callable that requires the Python Global
+        Interpreter Lock may lead to deadlocks. 
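+
+        As an illustrative sketch (assuming ``kernel`` is a compiled kernel
+        object), a constant dynamic shared-memory requirement is passed as a
+        plain ``int``::
+
+            # 1 KiB of dynamic shared memory per block; block_size_limit=0
+            # defers to the device/kernel maximum block size
+            res = kernel.occupancy.max_potential_block_size(1024, 0)
+            grid, block = res.min_grid_size, res.max_block_size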
+ """ if isinstance(dynamic_shared_memory_needed, int): min_grid_size, max_block_size = handle_return( @@ -224,12 +248,17 @@ def max_potential_block_size( self._handle, None, dynamic_shared_memory_needed, block_size_limit ) ) - else: + elif isinstance(dynamic_shared_memory_needed, driver.CUoccupancyB2DSize): min_grid_size, max_block_size = handle_return( driver.cuOccupancyMaxPotentialBlockSize( self._handle, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit ) ) + else: + raise TypeError( + "dynamic_shared_memory_needed expected to have type int, or CUoccupancyB2DSize, " + f"got {type(dynamic_shared_memory_needed)}" + ) return MaxPotentialBlockSizeOccupancyResult(min_grid_size=min_grid_size, max_block_size=max_block_size) def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int: From f1ff0f5c9555f2900ef11cb0e65182dab238f290 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 30 May 2025 10:08:34 -0500 Subject: [PATCH 11/16] Add test for dynamic_shared_memory_needed arg of invalid type --- cuda_core/tests/test_module.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index c3f0ea19f0..e0aa0aac7e 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -297,6 +297,9 @@ def test_saxpy_occupancy_max_potential_block_size_constant(get_saxpy_kernel, blo assert max_block_size <= dev_props.max_threads_per_block assert min_grid_size == config_data.min_grid_size assert max_block_size == config_data.max_block_size + invalid_dsmem = Ellipsis + with pytest.raises(TypeError): + kernel.occupancy.max_potential_block_size(invalid_dsmem, block_size_limit) @pytest.mark.skipif(numba is None, reason="Test requires numba to be installed") From 39a08f670d0869e1a2612ea7b3bcee778055c907 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 30 May 2025 10:40:05 -0500 Subject: [PATCH 12/16] Mention feature/occupancy in 0.3.0 release notes --- cuda_core/docs/source/release/0.3.0-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst index eb365b4cbf..df1f21ffd9 100644 --- a/cuda_core/docs/source/release/0.3.0-notes.rst +++ b/cuda_core/docs/source/release/0.3.0-notes.rst @@ -21,6 +21,7 @@ New features ------------ - :class:`Kernel` adds :property:`Kernel.num_arguments` and :property:`Kernel.arguments_info` for introspection of kernel arguments. (#612) +- Add pythonic access to kernel occupancy calculation functions via :property:`Kernel.occupancy`. 
(#648) New examples ------------ From f74dcf19e50d16f317695cd2e8262253bd57c678 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:34:46 -0500 Subject: [PATCH 13/16] Add symbols to api_private.rst --- cuda_core/docs/source/api_private.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index a38cc2602c..94f5a5df18 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -19,7 +19,11 @@ CUDA runtime _stream.Stream _event.Event _device.DeviceProperties + _launch_config.LaunchConfig _module.KernelAttributes + _module.KernelOccupancy + _module.ParamInfo + _module.MaxPotentialBlockSizeOccupancyResult CUDA compilation toolchain @@ -29,3 +33,4 @@ CUDA compilation toolchain :toctree: generated/ _module.Kernel + _module.ObjectCode From e2adc5750a77dce8b1066a0a58dfd158a76ae374 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:35:23 -0500 Subject: [PATCH 14/16] Reduce test name verbosity Occupancy tests need not contain saxpy in the test name even though it uses saxpy kernel for testing. --- cuda_core/tests/test_module.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index e0aa0aac7e..36b60b3bc2 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -260,7 +260,7 @@ def test_num_args_error_handling(deinit_all_contexts_function, cuda12_prerequisi @pytest.mark.parametrize("block_size", [32, 64, 96, 120, 128, 256]) @pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096]) -def test_saxpy_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel, block_size, smem_size_per_block): +def test_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel, block_size, smem_size_per_block): kernel, _ = get_saxpy_kernel dev_props = Device().properties assert block_size <= dev_props.max_threads_per_block @@ -277,7 +277,7 @@ def test_saxpy_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel, b @pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 256, 0]) @pytest.mark.parametrize("smem_size_per_block", [0, 32, 4096]) -def test_saxpy_occupancy_max_potential_block_size_constant(get_saxpy_kernel, block_size_limit, smem_size_per_block): +def test_occupancy_max_potential_block_size_constant(get_saxpy_kernel, block_size_limit, smem_size_per_block): """Tests use case when shared memory needed is independent on the block size""" kernel, _ = get_saxpy_kernel dev_props = Device().properties @@ -304,7 +304,7 @@ def test_saxpy_occupancy_max_potential_block_size_constant(get_saxpy_kernel, blo @pytest.mark.skipif(numba is None, reason="Test requires numba to be installed") @pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 277, 0]) -def test_saxpy_occupancy_max_potential_block_size_b2dsize(get_saxpy_kernel, block_size_limit): +def test_occupancy_max_potential_block_size_b2dsize(get_saxpy_kernel, block_size_limit): """Tests use case when shared memory needed depends on the block size""" kernel, _ = get_saxpy_kernel @@ -331,7 +331,7 @@ def shared_memory_needed(block_size: numba.intc) -> numba.size_t: @pytest.mark.parametrize("num_blocks_per_sm, block_size", [(4, 32), (2, 64), (2, 96), (3, 120), (2, 128), (1, 256)]) -def 
test_saxpy_occupancy_available_dynamic_shared_memory_per_block(get_saxpy_kernel, num_blocks_per_sm, block_size): +def test_occupancy_available_dynamic_shared_memory_per_block(get_saxpy_kernel, num_blocks_per_sm, block_size): kernel, _ = get_saxpy_kernel dev_props = Device().properties assert block_size <= dev_props.max_threads_per_block @@ -342,7 +342,7 @@ def test_saxpy_occupancy_available_dynamic_shared_memory_per_block(get_saxpy_ker @pytest.mark.parametrize("cluster", [None, 2]) -def test_saxpy_occupancy_max_active_clusters(get_saxpy_kernel, cluster): +def test_occupancy_max_active_clusters(get_saxpy_kernel, cluster): kernel, _ = get_saxpy_kernel dev = Device() if (cluster) and (dev.compute_capability < (9, 0)): @@ -357,7 +357,7 @@ def test_saxpy_occupancy_max_active_clusters(get_saxpy_kernel, cluster): assert max_active_clusters >= 0 -def test_saxpy_occupancy_max_potential_cluster_size(get_saxpy_kernel): +def test_occupancy_max_potential_cluster_size(get_saxpy_kernel): kernel, _ = get_saxpy_kernel dev = Device() launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64) From 496eb5bf4486da88c4b46019a1dc0fefc637a038 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:36:17 -0500 Subject: [PATCH 15/16] Add doc-strings to KernelOccupancy methods. --- cuda_core/cuda/core/experimental/_module.py | 78 +++++++++++++++++++-- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 78157f0d20..f6a1332b19 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -205,7 +205,30 @@ def _init(cls, handle): return self def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int: - """int : Occupancy of the kernel""" + """Occupancy of the kernel. + + Returns the maximum number of active blocks per multiprocessor for this kernel. + + Parameters + ---------- + block_size: int + Block size parameter used to launch this kernel. + dynamic_shared_memory_size: int + The amount of dynamic shared memory in bytes needed by block. + Use `0` if block does not need shared memory. + + Returns + ------- + int + The maximum number of active blocks per multiprocessor. + + Note + ---- + The fraction of the product of maximum number of active blocks per multiprocessor + and the block size to the maximum number of threads per multiprocessor is known as + theoretical multiprocessor utilization (occupancy). + + """ return handle_return( driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(self._handle, block_size, dynamic_shared_memory_size) ) @@ -262,20 +285,65 @@ def max_potential_block_size( return MaxPotentialBlockSizeOccupancyResult(min_grid_size=min_grid_size, max_block_size=max_block_size) def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int: - """int: Dynamic shared memory available per block for given launch configuration.""" + """Dynamic shared memory available per block for given launch configuration. + + The amount of dynamic shared memory per block, in bytes, for given kernel launch configuration. + + Parameter + --------- + num_blocks_per_multiprocessor: int + Number of blocks to be concurrently executing on a multiprocessor. + block_size: int + Block size parameter used to launch this kernel. 
+ + Returns + ------- + int + Dynamic shared memory available per block for given launch configuration. + """ return handle_return( driver.cuOccupancyAvailableDynamicSMemPerBlock(self._handle, num_blocks_per_multiprocessor, block_size) ) def max_potential_cluster_size(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int: - """ "int: The maximum cluster size that can be launched for this kernel and launch configuration""" + """Maximum potential cluster size. + + The maximum potential cluster size for this kernel and given launch configuration. + + Parameters + ---------- + config: :obj:`~_launch_config.LaunchConfig` + Kernel launch configuration. Cluster dimensions in the configuration are ignored. + stream: :obj:`~Stream`, optional + The stream on which this kernel is to be launched. + + Returns + ------- + int + The maximum cluster size that can be launched for this kernel and launch configuration. + """ drv_cfg = _to_native_launch_config(config) if stream is not None: drv_cfg.hStream = stream.handle return handle_return(driver.cuOccupancyMaxPotentialClusterSize(self._handle, drv_cfg)) def max_active_clusters(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int: - """ "int: The maximum number of clusters that could co-exist on the target device""" + """Maximum number of active clusters on the target device. + + The maximum number of clusters that could concurrently execute on the target device. + + Parameters + ---------- + config: :obj:`~_launch_config.LaunchConfig` + Kernel launch configuration. + stream: :obj:`~Stream`, optional + The stream on which this kernel is to be launched. + + Returns + ------- + int + The maximum number of clusters that could co-exist on the target device. + """ drv_cfg = _to_native_launch_config(config) if stream is not None: drv_cfg.hStream = stream.handle @@ -351,7 +419,7 @@ def arguments_info(self) -> list[ParamInfo]: @property def occupancy(self) -> KernelOccupancy: - """Get the read-only attributes of this kernel.""" + """Get the occupancy information for launching this kernel.""" if self._occupancy is None: self._occupancy = KernelOccupancy._init(self._handle) return self._occupancy From f74db2c263b6add45b5f3d27d93208818f4e3d4a Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Jun 2025 23:20:17 -0400 Subject: [PATCH 16/16] fix rendering --- cuda_core/cuda/core/experimental/_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index f6a1332b19..9c80c687b7 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -289,8 +289,8 @@ def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocesso The amount of dynamic shared memory per block, in bytes, for given kernel launch configuration. - Parameter - --------- + Parameters + ---------- num_blocks_per_multiprocessor: int Number of blocks to be concurrently executing on a multiprocessor. block_size: int
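
Usage sketch for the occupancy API added in this series (illustrative
only; it assumes `program` is a Program compiled from the SAXPY source
used in the tests, with "saxpy<float>" among its name expressions):

    from cuda.core.experimental import Device

    dev = Device()
    dev.set_current()

    kernel = program.compile("cubin").get_kernel("saxpy<float>")

    # Pick a block size with maximal theoretical occupancy, assuming no
    # dynamic shared memory (0) and no explicit block-size limit (0).
    res = kernel.occupancy.max_potential_block_size(0, 0)
    block = res.max_block_size

    blocks_per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(block, 0)
    smem = kernel.occupancy.available_dynamic_shared_memory_per_block(blocks_per_sm, block)
    print(f"block={block}, blocks/SM={blocks_per_sm}, dynamic smem/block={smem}")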