diff --git a/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py b/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py index f638f9e74b4..608af19a4f3 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py +++ b/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py @@ -91,10 +91,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -115,10 +112,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -141,10 +135,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -167,7 +158,4 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) diff --git a/python/cuda_cccl/benchmarks/compute/bench_reduce.py b/python/cuda_cccl/benchmarks/compute/bench_reduce.py index 46952a29e78..8bf483702a3 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_reduce.py +++ b/python/cuda_cccl/benchmarks/compute/bench_reduce.py @@ -91,10 +91,7 @@ def run(): reduce_pointer(input_array, build_only=(bench_fixture == "compile_benchmark")) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -109,10 +106,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -127,10 +121,7 @@ def run(): reduce_struct(input_array, build_only=(bench_fixture == "compile_benchmark")) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -141,10 +132,7 @@ def run(): reduce_pointer_custom_op(input_array, build_only=False) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) def bench_reduce_pointer_single_phase(benchmark, size): diff --git a/python/cuda_cccl/benchmarks/compute/bench_scan.py b/python/cuda_cccl/benchmarks/compute/bench_scan.py index 6cd6b7a62bf..95f4e892c06 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_scan.py +++ b/python/cuda_cccl/benchmarks/compute/bench_scan.py @@ -108,13 +108,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - if scan_type == "exclusive": - fixture(cuda.compute.make_exclusive_scan, run) - else: - fixture(cuda.compute.make_inclusive_scan, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("scan_type", ["exclusive", "inclusive"]) @@ -145,13 +139,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - if scan_type == "exclusive": - fixture(cuda.compute.make_exclusive_scan, run) - else: - fixture(cuda.compute.make_inclusive_scan, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("scan_type", ["exclusive", "inclusive"]) @@ -171,13 +159,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - if scan_type == "exclusive": - fixture(cuda.compute.make_exclusive_scan, run) - else: - fixture(cuda.compute.make_inclusive_scan, run) - else: - fixture(run) + fixture(run) def scan_pointer_single_phase(input_array, build_only, scan_type): diff --git a/python/cuda_cccl/benchmarks/compute/bench_select.py b/python/cuda_cccl/benchmarks/compute/bench_select.py new file mode 100644 index 00000000000..7f3f38609f6 --- /dev/null +++ b/python/cuda_cccl/benchmarks/compute/bench_select.py @@ -0,0 +1,127 @@ +import cupy as cp +import numpy as np +import pytest + +import cuda.compute +from cuda.compute import ( + CacheModifiedInputIterator, + gpu_struct, +) + + +def select_pointer(inp, out, num_selected, build_only): + size = len(inp) + + def even_op(x): + return x % 2 == 0 + + selector = cuda.compute.make_select(inp, out, num_selected, even_op) + if not build_only: + temp_bytes = selector(None, inp, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, inp, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +def select_iterator(size, d_in, out, num_selected, build_only): + d_in_iter = CacheModifiedInputIterator(d_in, modifier="stream") + + def less_than_50(x): + return x < 50 + + selector = cuda.compute.make_select(d_in_iter, out, num_selected, less_than_50) + if not build_only: + temp_bytes = selector(None, d_in_iter, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, d_in_iter, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +@gpu_struct +class Point: + x: np.int32 + y: np.int32 + + +def select_struct(inp, out, num_selected, build_only): + size = len(inp) + + def in_first_quadrant(p: Point) -> np.uint8: + return (p.x > 50) and (p.y > 50) + + selector = cuda.compute.make_select(inp, out, num_selected, in_first_quadrant) + if not build_only: + temp_bytes = selector(None, inp, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, inp, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +def select_stateful(inp, out, num_selected, threshold_state, build_only): + size = len(inp) + + def threshold_select(x): + return x > threshold_state[0] + + selector = cuda.compute.make_select(inp, out, num_selected, threshold_select) + if not build_only: + temp_bytes = selector(None, inp, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, inp, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) +def bench_select_pointer(bench_fixture, request, size): + actual_size = 100 if bench_fixture == "compile_benchmark" else size + inp = cp.random.randint(0, 100, actual_size, dtype=np.int32) + out = cp.empty_like(inp) + num_selected = cp.empty(2, dtype=np.uint64) + + def run(): + select_pointer( + inp, out, num_selected, build_only=(bench_fixture == "compile_benchmark") + ) + + fixture = request.getfixturevalue(bench_fixture) + fixture(run) + + +@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) +def bench_select_iterator(bench_fixture, request, size): + actual_size = 100 if bench_fixture == "compile_benchmark" else size + d_in = cp.random.randint(0, 100, actual_size, dtype=np.int32) + out = cp.empty(actual_size, dtype=np.int32) + num_selected = cp.empty(2, dtype=np.uint64) + + def run(): + select_iterator( + actual_size, + d_in, + out, + num_selected, + build_only=(bench_fixture == "compile_benchmark"), + ) + + fixture = request.getfixturevalue(bench_fixture) + fixture(run) + + +@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) +def bench_select_struct(bench_fixture, request, size): + actual_size = 100 if bench_fixture == "compile_benchmark" else size + inp = cp.random.randint(0, 100, (actual_size, 2), dtype=np.int32).view(Point.dtype) + out = cp.empty_like(inp) + num_selected = cp.empty(2, dtype=np.uint64) + + def run(): + select_struct( + inp, out, num_selected, build_only=(bench_fixture == "compile_benchmark") + ) + + fixture = request.getfixturevalue(bench_fixture) + fixture(run) diff --git a/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py b/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py index e88b377f138..a9bef4e27a7 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py +++ b/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py @@ -167,10 +167,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_three_way_partition, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -192,10 +189,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_three_way_partition, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -218,10 +212,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_three_way_partition, run) - else: - fixture(run) + fixture(run) def three_way_partition_pointer_single_phase(inp): diff --git a/python/cuda_cccl/benchmarks/compute/bench_transform.py b/python/cuda_cccl/benchmarks/compute/bench_transform.py index e17044afd00..79c1cce4e77 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_transform.py +++ b/python/cuda_cccl/benchmarks/compute/bench_transform.py @@ -117,10 +117,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_unary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -135,10 +132,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_unary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -154,10 +148,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_unary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -174,10 +165,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -192,10 +180,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -212,10 +197,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -231,7 +213,4 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) diff --git a/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py b/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py index 41387086a9e..c9e27dcd5ae 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py +++ b/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py @@ -106,10 +106,7 @@ def run(): reduce_zip_array(input_array, build_only=(bench_fixture == "compile_benchmark")) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -122,10 +119,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -138,10 +132,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -163,7 +154,4 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) diff --git a/python/cuda_cccl/benchmarks/compute/conftest.py b/python/cuda_cccl/benchmarks/compute/conftest.py index 0e4f9c73829..981b203b7b4 100644 --- a/python/cuda_cccl/benchmarks/compute/conftest.py +++ b/python/cuda_cccl/benchmarks/compute/conftest.py @@ -1,5 +1,7 @@ import pytest +import cuda.compute + @pytest.fixture(params=[True, False]) def build_only(request): @@ -13,11 +15,11 @@ def size(request): @pytest.fixture def compile_benchmark(benchmark): - def run_compile_benchmark(algorithm, function): + def run_compile_benchmark(function): def setup(): # This function is called once before the benchmark runs # to set up the environment. - algorithm.cache_clear() + cuda.compute.clear_all_caches() benchmark.pedantic( function, diff --git a/python/cuda_cccl/cuda/compute/__init__.py b/python/cuda_cccl/cuda/compute/__init__.py index 71f0ad70f4b..854beee17ba 100644 --- a/python/cuda_cccl/cuda/compute/__init__.py +++ b/python/cuda_cccl/cuda/compute/__init__.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from ._caching import clear_all_caches from .algorithms import ( DoubleBuffer, SortOrder, @@ -49,13 +50,13 @@ __all__ = [ "binary_transform", + "clear_all_caches", "CacheModifiedInputIterator", "ConstantIterator", "CountingIterator", "DiscardIterator", "DoubleBuffer", "exclusive_scan", - "select", "gpu_struct", "histogram_even", "inclusive_scan", @@ -81,6 +82,7 @@ "ReverseIterator", "segmented_reduce", "segmented_sort", + "select", "SortOrder", "TransformIterator", "TransformOutputIterator", diff --git a/python/cuda_cccl/cuda/compute/_caching.py b/python/cuda_cccl/cuda/compute/_caching.py index 0443f38c0ea..ad675c05341 100644 --- a/python/cuda_cccl/cuda/compute/_caching.py +++ b/python/cuda_cccl/cuda/compute/_caching.py @@ -10,6 +10,9 @@ except ImportError: from cuda.core.experimental import Device +# Central registry of all algorithm caches +_cache_registry: dict[str, object] = {} + def cache_with_key(key): """ @@ -21,6 +24,9 @@ def cache_with_key(key): ----- The CUDA compute capability of the current device is appended to the cache key returned by `key`. + + The decorated function is automatically registered in the central + cache registry for easy cache management. """ def deco(func): @@ -39,11 +45,33 @@ def cache_clear(): cache.clear() inner.cache_clear = cache_clear + + # Register the cache in the central registry + cache_name = func.__qualname__ + _cache_registry[cache_name] = inner + return inner return deco +def clear_all_caches(): + """ + Clear all algorithm caches. + + This function clears all cached algorithm build results, forcing + recompilation on the next invocation. Useful for benchmarking + compilation time. + + Example + ------- + >>> import cuda.compute + >>> cuda.compute.clear_all_caches() + """ + for cached_func in _cache_registry.values(): + cached_func.cache_clear() + + class CachableFunction: """ A type that wraps a function and provides custom comparison