diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 55afb0ebab..ab31b014e8 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -701,6 +701,17 @@ def can_use_host_pointer_for_registered_mem(self) -> bool:
             )
         )
 
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
+    @property
+    def cooperative_launch(self) -> bool:
+        """
+        True if device supports launching cooperative kernels, False if not.
+        """
+        return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH))
+
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
     @property
     def max_shared_memory_per_block_optin(self) -> int:
         """
diff --git a/cuda_core/cuda/core/experimental/_launch_config.py b/cuda_core/cuda/core/experimental/_launch_config.py
index bb4e92fb37..43b7c21095 100644
--- a/cuda_core/cuda/core/experimental/_launch_config.py
+++ b/cuda_core/cuda/core/experimental/_launch_config.py
@@ -58,11 +58,15 @@ class LaunchConfig:
     cluster: Union[tuple, int] = None
     block: Union[tuple, int] = None
     shmem_size: Optional[int] = None
+    cooperative_launch: Optional[bool] = False
 
     def __post_init__(self):
         _lazy_init()
         self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
         self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
+        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
+        # look up the device from stream. We probably need to defer the checks related to
+        # device compute capability or attributes.
         # thread block clusters are supported starting H100
         if self.cluster is not None:
             if not _use_ex:
@@ -77,6 +81,8 @@ def __post_init__(self):
         self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
         if self.shmem_size is None:
             self.shmem_size = 0
+        if self.cooperative_launch and not Device().properties.cooperative_launch:
+            raise CUDAError("cooperative kernels are not supported on this device")
 
 
 def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
@@ -92,6 +98,11 @@ def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
         dim = attr.value.clusterDim
         dim.x, dim.y, dim.z = config.cluster
         attrs.append(attr)
+    if config.cooperative_launch:
+        attr = driver.CUlaunchAttribute()
+        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
+        attr.value.cooperative = 1
+        attrs.append(attr)
     drv_cfg.numAttrs = len(attrs)
     drv_cfg.attrs = attrs
     return drv_cfg
diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index 72afb5ffd3..1177d60340 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -9,6 +9,7 @@
 from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils.clear_error_support import assert_type
 from cuda.core.experimental._utils.cuda_utils import (
+    _reduce_3_tuple,
     check_or_create_options,
     driver,
     get_binding_version,
@@ -78,6 +79,8 @@ def launch(stream, config, kernel, *kernel_args):
     if _use_ex:
         drv_cfg = _to_native_launch_config(config)
         drv_cfg.hStream = stream.handle
+        if config.cooperative_launch:
+            _check_cooperative_launch(kernel, config, stream)
         handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
         # TODO: check if config has any unsupported attrs
@@ -86,3 +89,16 @@ def launch(stream, config, kernel, *kernel_args):
                 int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
             )
         )
+
+
+def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
+    dev = stream.device
+    num_sm = dev.properties.multiprocessor_count
+    max_grid_size = (
+        kernel.occupancy.max_active_blocks_per_multiprocessor(_reduce_3_tuple(config.block), config.shmem_size) * num_sm
+    )
+    if _reduce_3_tuple(config.grid) > max_grid_size:
+        # For now let's try not to be smart and adjust the grid size behind users' back.
+        # We explicitly ask users to adjust.
+        x, y, z = config.grid
+        raise ValueError(f"The specified grid size ({x} * {y} * {z}) exceeds the limit ({max_grid_size})")
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.py b/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
index 7cf9be31dd..48b48d2fb8 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
@@ -48,6 +48,10 @@ def cast_to_3_tuple(label, cfg):
     return cfg + (1,) * (3 - len(cfg))
 
 
+def _reduce_3_tuple(t: tuple):
+    return t[0] * t[1] * t[2]
+
+
 def _check_driver_error(error):
     if error == driver.CUresult.CUDA_SUCCESS:
         return
diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst
index 88a028b51a..2d32a38906 100644
--- a/cuda_core/docs/source/release/0.3.0-notes.rst
+++ b/cuda_core/docs/source/release/0.3.0-notes.rst
@@ -22,6 +22,7 @@ New features
 - :class:`Kernel` adds :property:`Kernel.num_arguments` and :property:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
 - Add pythonic access to kernel occupancy calculation functions via :property:`Kernel.occupancy`. (#648)
+- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to ``True``.
 
 New examples
 ------------
 
@@ -31,4 +32,4 @@ Fixes and enhancements
 ----------------------
 
 - An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
-- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed
+- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed.
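Reviewer note (not part of the patch): the cap enforced by `_check_cooperative_launch` is just occupancy times SM count, so user code can size a legal cooperative grid up front using the same public APIs this diff relies on. A minimal sketch, assuming `dev` is the current `Device` and `ker` an already-compiled `Kernel` (as in the test further down):

    # Largest cooperative grid this kernel can launch on this device:
    # resident blocks per SM (for this block/shmem size) times the SM count.
    block = 128
    shmem_size = 0
    max_blocks_per_sm = ker.occupancy.max_active_blocks_per_multiprocessor(block, shmem_size)
    max_coop_grid = max_blocks_per_sm * dev.properties.multiprocessor_count
    config = LaunchConfig(grid=max_coop_grid, block=block, cooperative_launch=True)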
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 4d615d73da..b37093eb0d 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -69,3 +69,7 @@ def pop_all_contexts():
     os.environ.get("CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER", "0") == "1",
     reason="The compute-sanitizer is running, and this test causes an API error.",
 )
+
+
+# TODO: make the fixture more sophisticated using path finder
+skipif_need_cuda_headers = pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need CUDA header")
diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index 309d661f50..e59ca56b5e 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -191,6 +191,7 @@ def test_compute_capability():
         ("concurrent_managed_access", bool),
         ("compute_preemption_supported", bool),
         ("can_use_host_pointer_for_registered_mem", bool),
+        ("cooperative_launch", bool),
         ("max_shared_memory_per_block_optin", int),
         ("pageable_memory_access_uses_host_page_tables", bool),
         ("direct_managed_mem_access_from_host", bool),
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index eb954070e9..45d8895a06 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 import pytest
-from conftest import skipif_testing_with_compute_sanitizer
+from conftest import skipif_need_cuda_headers, skipif_testing_with_compute_sanitizer
 
 import cuda.core.experimental
 from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
@@ -114,9 +114,8 @@ def test_error_timing_recorded():
     event3 - event2
 
 
-# TODO: improve this once path finder can find headers
 @skipif_testing_with_compute_sanitizer
-@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
+@skipif_need_cuda_headers  # libcu++
 @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
 def test_error_timing_incomplete():
     device = Device()
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 9c72693a1b..635d1fcf5c 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pytest
+from conftest import skipif_need_cuda_headers
 
 from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
 from cuda.core.experimental._memory import _DefaultPinnedMemorySource
@@ -152,3 +153,48 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value):
 
     # Check result
     assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}"
+
+
+@skipif_need_cuda_headers  # cg
+def test_cooperative_launch():
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream(options={"nonblocking": True})
+
+    # CUDA kernel performing a grid-wide sync via cooperative groups
+    code = r"""
+    #include <cooperative_groups.h>
+
+    extern "C" __global__ void test_grid_sync() {
+        namespace cg = cooperative_groups;
+        auto grid = cg::this_grid();
+        grid.sync();
+    }
+    """
+
+    # Compile for the current device's architecture
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
+    pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=include_path)
+    prog = Program(code, code_type="c++", options=pro_opts)
+    ker = prog.compile("cubin").get_kernel("test_grid_sync")
+
+    # # Launch without setting cooperative_launch
+    # # Commented out as this seems to be a sticky error...
+    # config = LaunchConfig(grid=1, block=1)
+    # launch(s, config, ker)
+    # from cuda.core.experimental._utils.cuda_utils import CUDAError
+    # with pytest.raises(CUDAError) as e:
+    #     s.sync()
+    # assert "CUDA_ERROR_LAUNCH_FAILED" in str(e)
+
+    # An oversized grid should be rejected
+    block = 128
+    config = LaunchConfig(grid=dev.properties.max_grid_dim_x // block + 1, block=block, cooperative_launch=True)
+    with pytest.raises(ValueError):
+        launch(s, config, ker)
+
+    # This works just fine
+    config = LaunchConfig(grid=1, block=1, cooperative_launch=True)
+    launch(s, config, ker)
+    s.sync()
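For reference, end-to-end usage of the new flag (a sketch distilled from `test_cooperative_launch` above, not part of the patch; `ker` stands for a kernel compiled against cooperative_groups as in the test):

    from cuda.core.experimental import Device, LaunchConfig, launch

    dev = Device()
    dev.set_current()
    s = dev.create_stream()

    # The new device attribute gates the new LaunchConfig flag.
    if dev.properties.cooperative_launch:
        config = LaunchConfig(grid=1, block=1, cooperative_launch=True)
        launch(s, config, ker)  # grid-wide sync happens inside the kernel
        s.sync()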