diff --git a/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py b/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py
index f638f9e74b4..608af19a4f3 100644
--- a/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py
+++ b/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py
@@ -91,10 +91,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_merge_sort, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -115,10 +112,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_merge_sort, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -141,10 +135,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_merge_sort, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -167,7 +158,4 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_merge_sort, run)
-    else:
-        fixture(run)
+    fixture(run)
diff --git a/python/cuda_cccl/benchmarks/compute/bench_reduce.py b/python/cuda_cccl/benchmarks/compute/bench_reduce.py
index 46952a29e78..8bf483702a3 100644
--- a/python/cuda_cccl/benchmarks/compute/bench_reduce.py
+++ b/python/cuda_cccl/benchmarks/compute/bench_reduce.py
@@ -91,10 +91,7 @@ def run():
         reduce_pointer(input_array, build_only=(bench_fixture == "compile_benchmark"))
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_reduce_into, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -109,10 +106,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_reduce_into, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -127,10 +121,7 @@ def run():
         reduce_struct(input_array, build_only=(bench_fixture == "compile_benchmark"))
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_reduce_into, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -141,10 +132,7 @@ def run():
         reduce_pointer_custom_op(input_array, build_only=False)
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_reduce_into, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 def bench_reduce_pointer_single_phase(benchmark, size):
diff --git a/python/cuda_cccl/benchmarks/compute/bench_scan.py b/python/cuda_cccl/benchmarks/compute/bench_scan.py
index 6cd6b7a62bf..95f4e892c06 100644
--- a/python/cuda_cccl/benchmarks/compute/bench_scan.py
+++ b/python/cuda_cccl/benchmarks/compute/bench_scan.py
@@ -108,13 +108,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        if scan_type == "exclusive":
-            fixture(cuda.compute.make_exclusive_scan, run)
-        else:
-            fixture(cuda.compute.make_inclusive_scan, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("scan_type", ["exclusive", "inclusive"])
@@ -145,13 +139,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        if scan_type == "exclusive":
-            fixture(cuda.compute.make_exclusive_scan, run)
-        else:
-            fixture(cuda.compute.make_inclusive_scan, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("scan_type", ["exclusive", "inclusive"])
@@ -171,13 +159,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        if scan_type == "exclusive":
-            fixture(cuda.compute.make_exclusive_scan, run)
-        else:
-            fixture(cuda.compute.make_inclusive_scan, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 def scan_pointer_single_phase(input_array, build_only, scan_type):
diff --git a/python/cuda_cccl/benchmarks/compute/bench_select.py b/python/cuda_cccl/benchmarks/compute/bench_select.py
new file mode 100644
index 00000000000..7f3f38609f6
--- /dev/null
+++ b/python/cuda_cccl/benchmarks/compute/bench_select.py
@@ -0,0 +1,127 @@
+import cupy as cp
+import numpy as np
+import pytest
+
+import cuda.compute
+from cuda.compute import (
+    CacheModifiedInputIterator,
+    gpu_struct,
+)
+
+
+def select_pointer(inp, out, num_selected, build_only):
+    """Build a select that keeps even values; also run it unless build_only."""
+    num_items = len(inp)
+
+    def even_op(x):
+        return x % 2 == 0
+
+    do_select = cuda.compute.make_select(inp, out, num_selected, even_op)
+    if not build_only:
+        nbytes = do_select(None, inp, out, num_selected, num_items)
+        scratch = cp.empty(nbytes, dtype=np.uint8)
+        do_select(scratch, inp, out, num_selected, num_items)
+    cp.cuda.runtime.deviceSynchronize()
+
+
+def select_iterator(size, d_in, out, num_selected, build_only):
+    """Build a select (x < 50) over a "stream" cache-modified iterator of d_in."""
+    stream_in = CacheModifiedInputIterator(d_in, modifier="stream")
+
+    def less_than_50(x):
+        return x < 50
+
+    do_select = cuda.compute.make_select(stream_in, out, num_selected, less_than_50)
+    if not build_only:
+        nbytes = do_select(None, stream_in, out, num_selected, size)
+        scratch = cp.empty(nbytes, dtype=np.uint8)
+        do_select(scratch, stream_in, out, num_selected, size)
+    cp.cuda.runtime.deviceSynchronize()
+
+
+@gpu_struct
+class Point:  # two int32 fields; benchmark inputs are viewed as Point.dtype
+    x: np.int32
+    y: np.int32
+
+
+def select_struct(inp, out, num_selected, build_only):
+    """Build a select keeping Points with x > 50 and y > 50; run unless build_only."""
+    num_items = len(inp)
+
+    def in_first_quadrant(p: Point) -> np.uint8:
+        return (p.x > 50) and (p.y > 50)
+
+    do_select = cuda.compute.make_select(inp, out, num_selected, in_first_quadrant)
+    if not build_only:
+        nbytes = do_select(None, inp, out, num_selected, num_items)
+        scratch = cp.empty(nbytes, dtype=np.uint8)
+        do_select(scratch, inp, out, num_selected, num_items)
+    cp.cuda.runtime.deviceSynchronize()
+
+
+def select_stateful(inp, out, num_selected, threshold_state, build_only):
+    """Build a select keeping x > threshold_state[0]; run unless build_only."""
+    num_items = len(inp)
+
+    def threshold_select(x):
+        return x > threshold_state[0]
+
+    do_select = cuda.compute.make_select(inp, out, num_selected, threshold_select)
+    if not build_only:
+        nbytes = do_select(None, inp, out, num_selected, num_items)
+        scratch = cp.empty(nbytes, dtype=np.uint8)
+        do_select(scratch, inp, out, num_selected, num_items)
+    cp.cuda.runtime.deviceSynchronize()
+
+
+@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
+def bench_select_pointer(bench_fixture, request, size):
+    """Benchmark select_pointer: build only for compile_benchmark, else full run."""
+    compile_only = bench_fixture == "compile_benchmark"
+    n = 100 if compile_only else size
+    inp = cp.random.randint(0, 100, n, dtype=np.int32)
+    out = cp.empty_like(inp)
+    num_selected = cp.empty(2, dtype=np.uint64)
+
+    def run():
+        select_pointer(inp, out, num_selected, build_only=compile_only)
+
+    fixture = request.getfixturevalue(bench_fixture)
+    fixture(run)
+
+
+@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
+def bench_select_iterator(bench_fixture, request, size):
+    """Benchmark select_iterator.
+
+    compile_benchmark: build only, on a fixed 100-element input.
+    benchmark: build and run on `size` elements.
+    """
+    compile_only = bench_fixture == "compile_benchmark"
+    n = 100 if compile_only else size
+    d_in = cp.random.randint(0, 100, n, dtype=np.int32)
+    out = cp.empty(n, dtype=np.int32)
+    num_selected = cp.empty(2, dtype=np.uint64)
+
+    def run():
+        select_iterator(n, d_in, out, num_selected, build_only=compile_only)
+
+    fixture = request.getfixturevalue(bench_fixture)
+    fixture(run)
+
+
+@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
+def bench_select_struct(bench_fixture, request, size):
+    """Benchmark select_struct: build only for compile_benchmark, else full run."""
+    compile_only = bench_fixture == "compile_benchmark"
+    n = 100 if compile_only else size
+    inp = cp.random.randint(0, 100, (n, 2), dtype=np.int32).view(Point.dtype)
+    out = cp.empty_like(inp)
+    num_selected = cp.empty(2, dtype=np.uint64)
+
+    def run():
+        select_struct(inp, out, num_selected, build_only=compile_only)
+
+    fixture = request.getfixturevalue(bench_fixture)
+    fixture(run)
diff --git a/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py b/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py
index e88b377f138..a9bef4e27a7 100644
--- a/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py
+++ b/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py
@@ -167,10 +167,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_three_way_partition, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -192,10 +189,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_three_way_partition, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -218,10 +212,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_three_way_partition, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 def three_way_partition_pointer_single_phase(inp):
diff --git a/python/cuda_cccl/benchmarks/compute/bench_transform.py b/python/cuda_cccl/benchmarks/compute/bench_transform.py
index e17044afd00..79c1cce4e77 100644
--- a/python/cuda_cccl/benchmarks/compute/bench_transform.py
+++ b/python/cuda_cccl/benchmarks/compute/bench_transform.py
@@ -117,10 +117,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_unary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -135,10 +132,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_unary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -154,10 +148,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_unary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -174,10 +165,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_binary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -192,10 +180,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_binary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -212,10 +197,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_binary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -231,7 +213,4 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_binary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
diff --git a/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py b/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py
index 41387086a9e..c9e27dcd5ae 100644
--- a/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py
+++ b/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py
@@ -106,10 +106,7 @@ def run():
         reduce_zip_array(input_array, build_only=(bench_fixture == "compile_benchmark"))
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_reduce_into, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -122,10 +119,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_reduce_into, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -138,10 +132,7 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_reduce_into, run)
-    else:
-        fixture(run)
+    fixture(run)
 
 
 @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"])
@@ -163,7 +154,4 @@ def run():
         )
 
     fixture = request.getfixturevalue(bench_fixture)
-    if bench_fixture == "compile_benchmark":
-        fixture(cuda.compute.make_binary_transform, run)
-    else:
-        fixture(run)
+    fixture(run)
diff --git a/python/cuda_cccl/benchmarks/compute/conftest.py b/python/cuda_cccl/benchmarks/compute/conftest.py
index 0e4f9c73829..981b203b7b4 100644
--- a/python/cuda_cccl/benchmarks/compute/conftest.py
+++ b/python/cuda_cccl/benchmarks/compute/conftest.py
@@ -1,5 +1,7 @@
 import pytest
 
+import cuda.compute
+
 
 @pytest.fixture(params=[True, False])
 def build_only(request):
@@ -13,11 +15,11 @@ def size(request):
 
 @pytest.fixture
 def compile_benchmark(benchmark):
-    def run_compile_benchmark(algorithm, function):
+    def run_compile_benchmark(function):
         def setup():
             # This function is called once before the benchmark runs
             # to set up the environment.
-            algorithm.cache_clear()
+            cuda.compute.clear_all_caches()
 
         benchmark.pedantic(
             function,
diff --git a/python/cuda_cccl/cuda/compute/__init__.py b/python/cuda_cccl/cuda/compute/__init__.py
index 71f0ad70f4b..854beee17ba 100644
--- a/python/cuda_cccl/cuda/compute/__init__.py
+++ b/python/cuda_cccl/cuda/compute/__init__.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from ._caching import clear_all_caches
 from .algorithms import (
     DoubleBuffer,
     SortOrder,
@@ -49,13 +50,13 @@
 
 __all__ = [
     "binary_transform",
+    "clear_all_caches",
     "CacheModifiedInputIterator",
     "ConstantIterator",
     "CountingIterator",
     "DiscardIterator",
     "DoubleBuffer",
     "exclusive_scan",
-    "select",
     "gpu_struct",
     "histogram_even",
     "inclusive_scan",
@@ -81,6 +82,7 @@
     "ReverseIterator",
     "segmented_reduce",
     "segmented_sort",
+    "select",
     "SortOrder",
     "TransformIterator",
     "TransformOutputIterator",
diff --git a/python/cuda_cccl/cuda/compute/_caching.py b/python/cuda_cccl/cuda/compute/_caching.py
index 0443f38c0ea..ad675c05341 100644
--- a/python/cuda_cccl/cuda/compute/_caching.py
+++ b/python/cuda_cccl/cuda/compute/_caching.py
@@ -10,6 +10,9 @@
 except ImportError:
     from cuda.core.experimental import Device
 
+# Central registry of all algorithm caches
+_cache_registry: dict[str, object] = {}
+
 
 def cache_with_key(key):
     """
@@ -21,6 +24,9 @@ def cache_with_key(key):
     -----
     The CUDA compute capability of the current device is appended to
     the cache key returned by `key`.
+
+    The decorated function is automatically registered in the central
+    cache registry for easy cache management.
     """
 
     def deco(func):
@@ -39,11 +45,33 @@ def cache_clear():
             cache.clear()
 
         inner.cache_clear = cache_clear
+
+        # Register the cache in the central registry
+        cache_name = func.__qualname__
+        _cache_registry[cache_name] = inner
+
         return inner
 
     return deco
 
 
+def clear_all_caches():
+    """
+    Empty the cache of every registered algorithm builder.
+
+    Walks `_cache_registry` and calls `cache_clear()` on each entry, so
+    the next build request has to recompile instead of reusing a cached
+    result. Intended for measuring compilation time.
+
+    Example
+    -------
+    >>> import cuda.compute
+    >>> cuda.compute.clear_all_caches()
+    """
+    for name in _cache_registry:
+        _cache_registry[name].cache_clear()
+
+
 class CachableFunction:
     """
     A type that wraps a function and provides custom comparison