diff --git a/projects/rocprim/.gitlab-ci.yml b/projects/rocprim/.gitlab-ci.yml
index 95b946ae49e..ec650059a29 100644
--- a/projects/rocprim/.gitlab-ci.yml
+++ b/projects/rocprim/.gitlab-ci.yml
@@ -610,6 +610,11 @@ benchmark:
       --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}"
       --benchmark_filter_regex "${BENCHMARK_ALGORITHM_REGEX}"
       --seed "${BENCHMARK_SEED}"
+    - python3
+      .gitlab/report_noise.py
+      --benchmark_json_dir "${BENCHMARK_RESULT_DIR}"
+      --noise_threshold_percentage 1.0
+      --accept_high_noise
   artifacts:
     paths:
       - ${BENCHMARK_RESULT_DIR}
@@ -698,6 +703,11 @@ autotune:execute-tuning:
       --size="${AUTOTUNE_SIZE}"
       --trials="${AUTOTUNE_TRIALS}"
       --seed=82589933
+    - python3
+      .gitlab/report_noise.py
+      --benchmark_json_dir "${AUTOTUNE_RESULT_DIR}"
+      --noise_threshold_percentage 1.0
+      --accept_high_noise
 
 autotune:generate-config:
   image: python:3.10.5-buster
diff --git a/projects/rocprim/.gitlab/report_noise.py b/projects/rocprim/.gitlab/report_noise.py
new file mode 100644
index 00000000000..974c2b128b7
--- /dev/null
+++ b/projects/rocprim/.gitlab/report_noise.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+
+import argparse
+import json
+import os
+import re
+import stat
+import statistics
+import sys
+
+
+class colors:
+    OK = "\033[92m"
+    FAIL = "\033[91m"
+    END_COLOR = "\033[0m"
+
+
+def print_results(results):
+    # Store the length of the longest value in a column
+    longest = {
+        "name": max(len(result["name"]) for result in results),
+        "noisy_permutations": max(
+            len(result["noisy_permutations"]) for result in results
+        ),
+        "mean": max(len(result["mean"]) for result in results),
+        "median": max(len(result["median"]) for result in results),
+        "max": max(len(result["max"]) for result in results),
+        "batch": max(len(result["batch"]) for result in results),
+        "warmup": max(len(result["warmup"]) for result in results),
+        "bytes": max(len(result["bytes"]) for result in results),
+    }
+
+    # The name of a column can be longer than its values
+    longest = {key: max(value, len(key)) for key, value in longest.items()}
+
+    printed = "name".ljust(longest["name"] + 1)
+    printed += "noisy permutations".ljust(longest["noisy_permutations"] + 1)
+    printed += "mean".ljust(longest["mean"] + 1)
+    printed += "median".ljust(longest["median"] + 1)
+    printed += "max".ljust(longest["max"] + 1)
+    printed += "batch".ljust(longest["batch"] + 1)
+    printed += "warmup".ljust(longest["warmup"] + 1)
+    printed += "bytes".ljust(longest["bytes"] + 1)
+    printed += "seed"
+    print(printed)
+
+    for result in results:
+        printed = result["name"].ljust(longest["name"])
+
+        printed += " "
+        printed += colors.FAIL if result["noisy"] else colors.OK
+        printed += (
+            f'{result["noisy_permutations"].ljust(longest["noisy_permutations"])}'
+        )
+        printed += colors.END_COLOR
+
+        printed += " "
+        printed += colors.FAIL if result["bad_mean"] else colors.OK
+        printed += result["mean"].ljust(longest["mean"])
+        printed += colors.END_COLOR
+
+        printed += " "
+        printed += colors.FAIL if result["bad_median"] else colors.OK
+        printed += result["median"].ljust(longest["median"])
+        printed += colors.END_COLOR
+
+        printed += " "
+        printed += colors.FAIL if result["bad_max"] else colors.OK
+        printed += result["max"].ljust(longest["max"])
+        printed += colors.END_COLOR
+
+        printed += " "
+        printed += colors.FAIL if result["bad_batch"] else colors.OK
+        printed += result["batch"].ljust(longest["batch"])
+        printed += colors.END_COLOR
+
+        printed += " "
+        printed += colors.FAIL if result["bad_warmup"] else colors.OK
+        printed += result["warmup"].ljust(longest["warmup"])
+        printed += colors.END_COLOR
+
+        printed += " "
+        printed += colors.FAIL if result["bad_bytes"] else colors.OK
+        printed += result["bytes"].ljust(longest["bytes"])
+        printed += colors.END_COLOR
+
+        printed += " "
+        printed += colors.FAIL if result["seed"] == "random" else colors.OK
+        printed += result["seed"]
+        printed += colors.END_COLOR
+
+        print(printed)
+
+
+def get_results(benchmarks, threshold):
+    def get_humanized_bytes(size):
+        for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
+            if size < 1024.0 or unit == "PiB":
+                break
+            size /= 1024.0
+        return f"{size:.1f} {unit}"
+
+    success = True
+
+    results = []
+
+    for benchmark in benchmarks:
+        data = benchmark["data"]
+
+        name = benchmark["name"]
+
+        permutations = data["benchmarks"]
+
+        cvs = [permutation["cv"] for permutation in permutations]
+
+        # The cv (coefficient of variation) is a standard way of quantifying noise
+        noises = sum(cv * 100 > threshold for cv in cvs)
+        noisy = noises > 0
+
+        if noisy:
+            success = False
+
+        context = data["context"]
+
+        noisy_permutations = f"{noises}/{len(permutations)}"
+
+        mean = statistics.mean(cvs)
+        median = statistics.median(cvs)
+        max_ = max(cvs)
+
+        batch = context["batch_iterations"]
+        warmup = context["warmup_iterations"]
+
+        bytes_ = int(context["size"])
+        seed = context["seed"]
+
+        results.append(
+            {
+                "name": name,
+                "noisy": noisy,
+                "noisy_permutations": noisy_permutations,
+                "bad_mean": mean * 100 > threshold,
+                "mean": f"{mean:.1%}",
+                "bad_median": median * 100 > threshold,
+                "median": f"{median:.1%}",
+                "bad_max": max_ * 100 > threshold,
+                "max": f"{max_:.1%}",
+                "bad_batch": int(batch) < 10,
+                "batch": batch,
+                "bad_warmup": int(warmup) < 5,
+                "warmup": warmup,
+                "bad_bytes": 0 < bytes_ < 128 * 1024 * 1024,  # 128 MiB
+                "bytes": get_humanized_bytes(int(context["size"])),
+                "seed": seed,
+            }
+        )
+
+    return results, success
+
+
+def load_benchmarks(benchmark_json_dir):
+    def is_benchmark_json(filename):
+        if not re.match(r".*\.json$", filename):
+            return False
+        path = os.path.join(benchmark_json_dir, filename)
+        st_mode = os.stat(path).st_mode
+
+        # we are not interested in permissions, just whether it is a regular file (S_IFREG)
+        return st_mode & stat.S_IFREG
+
+    benchmark_names = [
+        name for name in os.listdir(benchmark_json_dir) if is_benchmark_json(name)
+    ]
+
+    success = True
+    benchmarks = []
+    for benchmark_name in benchmark_names:
+        with open(os.path.join(benchmark_json_dir, benchmark_name)) as f:
+            try:
+                benchmarks.append({"name": benchmark_name, "data": json.load(f)})
+            except json.JSONDecodeError as e:
+                print(
+                    f"{colors.FAIL}Failed to load {benchmark_name}{colors.END_COLOR}: {e}\n",
+                    file=sys.stderr,
+                )
+                success = False
+
+    return benchmarks, success
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--noise_threshold_percentage",
+        help="The noise threshold percentage, past which benchmark permutations are considered to be too noisy",
+        required=True,
+        type=float,
+    )
+    parser.add_argument(
+        "--benchmark_json_dir",
+        help="The directory of benchmark JSON files, which to report the noise of",
+        required=True,
+    )
+    parser.add_argument(
+        "--accept_high_noise",
+        help="Don't call exit(1) when there is a noisy benchmark permutation",
+        action=argparse.BooleanOptionalAction,
+    )
+    args = parser.parse_args()
+
+    print(f"The noise threshold is {args.noise_threshold_percentage:.1f}%\n")
+
+    benchmarks, load_success = load_benchmarks(args.benchmark_json_dir)
+    results, results_success = get_results(benchmarks, args.noise_threshold_percentage)
+
+    print_results(results)
+
+    if not load_success:
+        return False
+    if args.accept_high_noise:
+        return True
+    return results_success
+
+
+if __name__ == "__main__":
+    success = main()
+    if success:
+        exit(0)
+    else:
+        exit(1)
diff --git a/projects/rocprim/.gitlab/run_benchmarks.py b/projects/rocprim/.gitlab/run_benchmarks.py
index f406ea473d4..2c0ff81f1f0 100755
--- a/projects/rocprim/.gitlab/run_benchmarks.py
+++ b/projects/rocprim/.gitlab/run_benchmarks.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -51,9 +51,6 @@ def is_benchmark_executable(filename):
         results_json_path = os.path.join(benchmark_context.benchmark_output_dir, results_json_name)
         args = [
             benchmark_path,
-            '--name_format',
-            'json',
-            '--benchmark_out_format=json',
             f'--benchmark_out={results_json_path}',
             f'--benchmark_filter={benchmark_context.benchmark_filter_regex}'
         ]
diff --git a/projects/rocprim/benchmark/benchmark_utils.hpp b/projects/rocprim/benchmark/benchmark_utils.hpp
index 84076fe41e9..98b8341784f 100644
--- a/projects/rocprim/benchmark/benchmark_utils.hpp
+++ b/projects/rocprim/benchmark/benchmark_utils.hpp
@@ -1345,8 +1345,8 @@ class executor
         parser.set_optional<int>("trials", "trials", default_trials, "number of iterations");
         parser.set_optional<std::string>("name_format",
                                          "name_format",
-                                         "human",
-                                         "either: json,human,txt");
+                                         "json",
+                                         "either json, human, or txt");
 
         // Optionally run an evenly split subset of benchmarks for autotuning.
         parser.set_optional<int>("parallel_instance",
diff --git a/projects/rocprim/docs/block_ops/data_mov_funcs.rst b/projects/rocprim/docs/block_ops/data_mov_funcs.rst
index 4fce51dbd54..2aacd1c4216 100644
--- a/projects/rocprim/docs/block_ops/data_mov_funcs.rst
+++ b/projects/rocprim/docs/block_ops/data_mov_funcs.rst
@@ -67,3 +67,18 @@ Store
 
 .. doxygengroup:: blockmodule_warp_store_functions
    :content-only:
+
+Direct Blocked Cast
+====================
+
+Load
+---------
+
+.. doxygengroup:: blockmodule_cast_load_functions
+   :content-only:
+
+Store
+----------
+
+.. doxygengroup:: blockmodule_cast_store_functions
+   :content-only:
diff --git a/projects/rocprim/docs/concepts/spirv.rst b/projects/rocprim/docs/concepts/spirv.rst
new file mode 100644
index 00000000000..ff84b6d026c
--- /dev/null
+++ b/projects/rocprim/docs/concepts/spirv.rst
@@ -0,0 +1,188 @@
+.. meta::
+  :description: rocPRIM documentation and API reference library
+  :keywords: rocPRIM, ROCm, API, documentation
+
+.. _spirv:
+
+******
+SPIR-V
+******
+
+.. warning::
+    This is an **experimental compile target** and may be subject to change without notice.
+    Use at your own risk. Features may be incomplete, unstable, or incompatible with future
+    versions. It is **not recommended for production use**.
+
+rocPRIM has experimental support for AMD GPU target agnostic SPIR-V.
+
+Requirements
+============
+
+The define ``ROCPRIM_EXPERIMENTAL_SPIRV`` must be set to a non-zero value. For example:
+
+.. code:: shell
+
+    hipcc -DROCPRIM_EXPERIMENTAL_SPIRV=1 --offload-arch=amdgcnspirv
+
+.. warning::
+    Setting `ROCPRIM_EXPERIMENTAL_SPIRV` will disable all config dispatching.
+
+.. note::
+    When using CMake, this flag needs to be set in the `CMAKE_CXX_FLAGS`. For example:
+    
+    .. code:: shell
+
+        cmake -DCMAKE_CXX_FLAGS="-DROCPRIM_EXPERIMENTAL_SPIRV=1 --offload-arch=amdgcnspirv"
+
+No other target apart from ``amdgcnspirv`` should be targeted. rocPRIM does not support mixed
+compilation targets.
+
+.. code:: shell
+
+    # !!! This does not work !!!
+    hipcc -DROCPRIM_EXPERIMENTAL_SPIRV=1 --offload-arch=amdgcnspirv --offload-arch=gfx942
+
+Warp- and block-level dispatch
+==============================
+
+When targeting SPIR-V, the hardware wavefront size (also known as warp size) is not known
+at compile time. It is only known during the execution of the host program and thus at the
+runtime compilation of SPIR-V code to the ISA of the actual target, e.g. gfx942 (MI300).
+rocPRIM will add implicit dispatching where it is needed when targeting SPIR-V. This allows
+the same SPIR-V code to work with hardware wavefront sizes of both 32 and 64.
+
+Adding dispatching
+------------------
+
+Adding SPIR-V wavefront dependent dispatching to an existing warp or block level algorithm can 
+be done using infrastructure provided by rocPRIM (see: ``rocprim::arch::wavefront``). This does
+require some extra boiler plate. For example, for an algorithm with some API:
+
+.. code:: cpp
+
+    template<typename T>
+    class my_alg
+    {
+    private:
+        static constexpr unsigned int wave_size = 32;
+    public:
+        __device__ void some_api(T& a, T& b)
+        {
+            some_opaque_impl<wave_size>(a, b);
+        }
+    };
+
+We need to know the current hardware wavefront size. This can be done with
+``rocprim::arch::wavefront::get_target()``. A partial specialization needs to be defined to
+handle the dispatching for dynamic wavefront sizes (either 32 or 64).
+
+.. code:: cpp
+
+    template<
+        typename T, 
+        rocprim::arch::wavefront::target TargetWaveSize = rocprim::arch::wavefront::get_target()
+    >
+    class my_alg
+    {
+    private:
+        constexpr unsigned int wave_size = rocprim::arch::wavefront::size_from_target<TargetWaveSize>();
+    public:
+        __device__ void some_api(T& a, T& b)
+        {
+            some_opaque_impl<wave_size>(a, b);
+        }
+    };
+
+    template<typename T>
+    class my_alg<T, ::rocprim::arch::wavefront::target::dynamic>
+    {
+    private:
+        using impl32 = my_alg<T, ::rocprim::arch::wavefront::target::size32>;
+        using impl64 = my_alg<T, ::rocprim::arch::wavefront::target::size64>;
+    public:
+        __device__ void some_api(T& a, T& b)
+        {
+            if (rocprim::arch::wavefront::size() == 32) {
+                impl32().some_api(a, b);
+            } else {
+                impl64().some_api(a, b);
+            }
+        }
+    };
+
+.. note::
+
+    Developers of rocPRIM can use the the ``rocprim::detail::dispatch_wave_size``. This function will also manage
+    exposed ``storage_type``- types to handle and map shared memory. Variadic templates are used to capture all
+    signatures for a given member function.
+
+    .. warning::
+        ``rocprim::detail::dispatch_wave_size`` is used internally by rocPRIM. Usage by downstream users is not
+        recommended because its behaviour and signature may change at any moment.
+
+    .. code:: cpp
+
+        template<typename T>
+        class my_alg<T, ::rocprim::arch::wavefront::target::dynamic>
+        {
+        private:
+            using impl32 = my_alg<T, ::rocprim::arch::wavefront::target::size32>;
+            using impl64 = my_alg<T, ::rocprim::arch::wavefront::target::size64>;
+            using dispatch = rocprim::detail::dispatch_wave_size<impl32, impl64>;
+        public:
+            template<typename... Args>
+            __device__ void some_api(Args&&... args)
+            {
+                dispatch{}([](auto impl, auto&&... args) { impl.some_api(args...); }, args...);
+            }
+        };
+
+Invoking a dispatchable algorithm
+---------------------------------
+
+When a rocPRIM provided algorithm is dependent on the wavefront size, the dispatcher will handle most of the 
+cases. ``rocprim::arch::wavefront::get_target()`` will resolve to ``target::dynamic`` and be handled via 
+partial specialization. However, there are cases where this is not desired. A downside of the dispatched
+implementation is that more shared memory is allocated than actually used. An algorithm that runs on a device
+with wavefront size 32 may have a different shared memory footprint than one with wavefront size 64. Using the
+dispatcher will then result in suboptimal occupancy.
+
+If it's known that a kernel is only invoked on hardware with a specific wavefront size, then the wavefront size can
+be passed to it.
+
+.. code:: cpp
+
+    using key_type   = int;
+    using value_type = rocprim::empty_type;
+
+    // Will use dispatch internally when targeting SPIR-V.
+    using sort_dispatch = rocprim::warp_sort<
+        key_type,
+        value_type
+    >;
+
+    // Will not use dispatch.
+    // Will only work proper with hardware with wavefront size of 32.
+    // Undefined behaviour on hardware with wavefront size of 64.
+    using sort_wave32   = rocprim::warp_sort<
+        key_type,
+        value_type,
+        rocprim::arch::wavefront::target::wave32
+    >;
+
+    // Will not use dispatch.
+    // Will only work proper with hardware with wavefront size of 64.
+    // Undefined behaviour on hardware with wavefront size of 32.
+    using sort_wave64   = rocprim::warp_sort<
+        key_type, value_type,
+        rocprim::arch::wavefront::target::wave64
+    >;
+
+When compiling for a specific architecture (and thus not SPIR-V), dispatch will not be used by default since
+``rocprim::arch::wavefront::get_target()`` will already resolve to the wavefront size of the architecture.
+
+.. code:: cpp
+
+    #ifdef ROCPRIM_TARGET_CDNA3
+        static_assert(rocprim::arch::wavefront::target::wave64 == rocprim::arch::wavefront::get_target());
+    #endif
diff --git a/projects/rocprim/docs/concepts/tuning.rst b/projects/rocprim/docs/concepts/tuning.rst
index f2b16fb0347..6a5419dd902 100644
--- a/projects/rocprim/docs/concepts/tuning.rst
+++ b/projects/rocprim/docs/concepts/tuning.rst
@@ -16,7 +16,7 @@ What we call *autotuning* is a method of generating the above-mentioned architec
 1. Configure the project for autotuning. Autotune is an extension on top of the regular benchmarking process and it is enabled with a CMake option ``BENCHMARK_CONFIG_TUNING``, which doubles as a C++ macro to determine whether autotuning is enabled.
 2. When the project is configured, a large amount of C++ benchmark files are generated with variation in parameters such as block size, items per thread, and method. The files are generated based on a template (``benchmark/benchmark_*.parallel.cpp.in``) and arguments defined in ``ConfigAutotuneSettings.cmake``. CMake will automatically detect when the input template changes and will reconfigure the required files as necessary.
 3. Compile results in one executable based on all generated files for an algorithm.
-4. Run the executable and gather the JSON output files. The generation of output files is triggered by the use of ``--benchmark_out_format=json --name_format json`` and ``--benchmark_out=<output_file_name>.json`` options when running the executable.
+4. Run the executable and gather the JSON output files. The generation of output files is triggered by the use of ``--benchmark_out=<output_file_name>.json`` when running the executable.
 5. Convert the benchmark results into a config with ``scripts/autotune/create_optimization.py``. This python script injects the optimal configurations into the templates in ``scripts/autotune/templates``.
 
   * The option ``--out_basedir`` can be used to place the output config(s) in a specific path, otherwise the config(s) will be placed in the current directory.
diff --git a/projects/rocprim/docs/doxygen/blockmodule.dox b/projects/rocprim/docs/doxygen/blockmodule.dox
index 9b4b2aa7b6e..2d4a67ebe83 100644
--- a/projects/rocprim/docs/doxygen/blockmodule.dox
+++ b/projects/rocprim/docs/doxygen/blockmodule.dox
@@ -18,4 +18,14 @@
 /**
  * \defgroup blockmodule_warp_store_functions Store Functions
  * \ingroup blockmodule
- */
\ No newline at end of file
+ */
+
+/**
+ * \defgroup blockmodule_cast_load_functions Load Functions
+ * \ingroup blockmodule
+ */
+
+/**
+ * \defgroup blockmodule_cast_store_functions Store Functions
+ * \ingroup blockmodule
+ */
diff --git a/projects/rocprim/docs/index.rst b/projects/rocprim/docs/index.rst
index 7d1015c2c3b..c3486d36019 100644
--- a/projects/rocprim/docs/index.rst
+++ b/projects/rocprim/docs/index.rst
@@ -22,6 +22,7 @@ The rocPRIM documentation is structured as follows:
     * :ref:`glossary`
     * :ref:`tuning`
     * :ref:`developer`
+    * :ref:`spirv`
 
   .. grid-item-card:: API reference
 
diff --git a/projects/rocprim/docs/reference/intrinsics.rst b/projects/rocprim/docs/reference/intrinsics.rst
index 2fab8cb2e56..e8defd31e04 100644
--- a/projects/rocprim/docs/reference/intrinsics.rst
+++ b/projects/rocprim/docs/reference/intrinsics.rst
@@ -16,7 +16,7 @@ Hardware Architecture
 .. doxygenfunction:: rocprim::arch::wavefront::max_size()
 
 .. doxygenenum:: rocprim::arch::wavefront::target
-.. doxygenfunction:: rocprim::arch::wavefront::target()
+.. doxygenfunction:: rocprim::arch::wavefront::get_target()
 .. doxygenfunction:: rocprim::arch::wavefront::size_from_target()
 
 Bitwise
diff --git a/projects/rocprim/docs/sphinx/_toc.yml.in b/projects/rocprim/docs/sphinx/_toc.yml.in
index 23ec331b41d..b2d64a2e6ca 100644
--- a/projects/rocprim/docs/sphinx/_toc.yml.in
+++ b/projects/rocprim/docs/sphinx/_toc.yml.in
@@ -10,6 +10,7 @@ subtrees:
       - file: concepts/glossary.rst
       - file: concepts/tuning.rst
       - file: concepts/developer.rst
+      - file: concepts/spirv.rst
   - file: reference/reference.rst
     title: API Reference
     subtrees:
diff --git a/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp b/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp
index 0fae5677651..bffd8c016a2 100644
--- a/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp
@@ -44,7 +44,7 @@ BEGIN_ROCPRIM_NAMESPACE
 /// \p ItemsPerThread into \p items.
 ///
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -79,7 +79,7 @@ void block_load_direct_blocked(unsigned int flat_id,
 /// \p ItemsPerThread into \p items.
 ///
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -120,7 +120,7 @@ void block_load_direct_blocked(unsigned int flat_id,
 /// \p ItemsPerThread into \p items.
 ///
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -166,7 +166,7 @@ void block_load_direct_blocked(unsigned int flat_id,
 /// block_load_direct_blocked:
 /// * \p ItemsPerThread is odd.
 /// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
-/// int4, etc.
+/// int4, etc.)
 ///
 /// \tparam T [inferred] the input data type
 /// \tparam U [inferred] the output data type
@@ -232,7 +232,7 @@ block_load_direct_blocked_vectorized(unsigned int flat_id,
 ///
 /// \tparam BlockSize the number of threads in a block
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -268,7 +268,7 @@ void block_load_direct_striped(unsigned int flat_id,
 ///
 /// \tparam BlockSize the number of threads in a block
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -311,7 +311,7 @@ void block_load_direct_striped(unsigned int flat_id,
 ///
 /// \tparam BlockSize the number of threads in a block
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -354,7 +354,6 @@ void block_load_direct_striped(unsigned int flat_id,
 /// \p ItemsPerThread into \p items.
 ///
 /// * The number of threads in the block must be a multiple of \p VirtualWaveSize.
-/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value.
 /// * \p VirtualWaveSize must be a power of two and equal or less than the size of
 ///   hardware warp.
 /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower
@@ -362,7 +361,7 @@ void block_load_direct_striped(unsigned int flat_id,
 ///
 /// \tparam VirtualWaveSize [optional] the number of threads in a warp
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -370,10 +369,7 @@ void block_load_direct_striped(unsigned int flat_id,
 /// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
 /// \param block_input the input iterator from the thread block to load from
 /// \param items array that data is loaded to
-template<unsigned int VirtualWaveSize = arch::wavefront::min_size(),
-         class InputIterator,
-         class T,
-         unsigned int ItemsPerThread>
+template<unsigned int VirtualWaveSize, class InputIterator, class T, unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
 void block_load_direct_warp_striped(unsigned int  flat_id,
                                     InputIterator block_input,
@@ -397,6 +393,46 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
     }
 }
 
+/// \brief Loads data from continuous memory into a warp-striped arrangement of items
+/// across the thread block, using the hardware warp size.
+///
+/// \ingroup blockmodule_warp_load_functions
+/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to load a range of
+/// \p ItemsPerThread into \p items.
+///
+/// \tparam InputIterator [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T [inferred] the data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input the input iterator from the thread block to load from
+/// \param items array that data is loaded to
+template<class InputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_warp_striped(unsigned int  flat_id,
+                                    InputIterator block_input,
+                                    T (&items)[ItemsPerThread])
+{
+    if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size())
+    {
+        block_load_direct_warp_striped<arch::wavefront::min_size()>(flat_id, block_input, items);
+    }
+    else
+    {
+        if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64)
+        {
+            block_load_direct_warp_striped<ROCPRIM_WARP_SIZE_64>(flat_id, block_input, items);
+        }
+        else
+        {
+            block_load_direct_warp_striped<ROCPRIM_WARP_SIZE_32>(flat_id, block_input, items);
+        }
+    }
+}
+
 /// \brief Loads data from continuous memory into a warp-striped arrangement of items
 /// across the thread block, which is guarded by range \p valid.
 ///
@@ -406,7 +442,6 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
 /// \p ItemsPerThread into \p items.
 ///
 /// * The number of threads in the block must be a multiple of \p VirtualWaveSize.
-/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value.
 /// * \p VirtualWaveSize must be a power of two and equal or less than the size of
 ///   hardware warp.
 /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower
@@ -414,7 +449,7 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
 ///
 /// \tparam VirtualWaveSize [optional] the number of threads in a warp
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -423,10 +458,7 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
 /// \param block_input the input iterator from the thread block to load from
 /// \param items array that data is loaded to
 /// \param valid maximum range of valid numbers to load
-template<unsigned int VirtualWaveSize = arch::wavefront::min_size(),
-         class InputIterator,
-         class T,
-         unsigned int ItemsPerThread>
+template<unsigned int VirtualWaveSize, class InputIterator, class T, unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
 void block_load_direct_warp_striped(unsigned int  flat_id,
                                     InputIterator block_input,
@@ -455,6 +487,57 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
     }
 }
 
+/// \brief Loads data from continuous memory into a warp-striped arrangement of items
+/// across the thread block, which is guarded by range \p valid, using the hardware warp size.
+///
+/// \ingroup blockmodule_warp_load_functions
+/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to load a range of
+/// \p ItemsPerThread into \p items.
+///
+/// \tparam InputIterator [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T [inferred] the data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input the input iterator from the thread block to load from
+/// \param items array that data is loaded to
+/// \param valid maximum range of valid numbers to load
+template<class InputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_warp_striped(unsigned int  flat_id,
+                                    InputIterator block_input,
+                                    T (&items)[ItemsPerThread],
+                                    unsigned int valid)
+{
+    if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size())
+    {
+        block_load_direct_warp_striped<arch::wavefront::min_size()>(flat_id,
+                                                                    block_input,
+                                                                    items,
+                                                                    valid);
+    }
+    else
+    {
+        if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64)
+        {
+            block_load_direct_warp_striped<ROCPRIM_WARP_SIZE_64>(flat_id,
+                                                                 block_input,
+                                                                 items,
+                                                                 valid);
+        }
+        else
+        {
+            block_load_direct_warp_striped<ROCPRIM_WARP_SIZE_32>(flat_id,
+                                                                 block_input,
+                                                                 items,
+                                                                 valid);
+        }
+    }
+}
+
 /// \brief Loads data from continuous memory into a warp-striped arrangement of items
 /// across the thread block, which is guarded by range with a fall-back value
 /// for out-of-bound elements.
@@ -465,7 +548,6 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
 /// \p ItemsPerThread into \p items.
 ///
 /// * The number of threads in the block must be a multiple of \p VirtualWaveSize.
-/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value.
 /// * \p VirtualWaveSize must be a power of two and equal or less than the size of
 ///   hardware warp.
 /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower
@@ -473,7 +555,7 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
 ///
 /// \tparam VirtualWaveSize [optional] the number of threads in a warp
 /// \tparam InputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -484,7 +566,7 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
 /// \param items array that data is loaded to
 /// \param valid maximum range of valid numbers to load
 /// \param out_of_bounds default value assigned to out-of-bound items
-template<unsigned int VirtualWaveSize = arch::wavefront::min_size(),
+template<unsigned int VirtualWaveSize,
          class InputIterator,
          class T,
          unsigned int ItemsPerThread,
@@ -511,18 +593,103 @@ void block_load_direct_warp_striped(unsigned int  flat_id,
     block_load_direct_warp_striped<VirtualWaveSize>(flat_id, block_input, items, valid);
 }
 
-template<class V                             = rocprim::uint128_t,
-         cache_load_modifier LoadType        = load_default,
-         unsigned int        VirtualWaveSize = arch::wavefront::min_size(),
+/// \brief Loads data from continuous memory into a warp-striped arrangement of items
+/// across the thread block, which is guarded by range with a fall-back value
+/// for out-of-bound elements, using the hardware warp size.
+///
+/// \ingroup blockmodule_warp_load_functions
+/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to load a range of
+/// \p ItemsPerThread into \p items.
+///
+/// \tparam InputIterator [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T [inferred] the data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+/// \tparam Default [inferred] The data type of the default value
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input the input iterator from the thread block to load from
+/// \param items array that data is loaded to
+/// \param valid maximum range of valid numbers to load
+/// \param out_of_bounds default value assigned to out-of-bound items
+template<class InputIterator, class T, unsigned int ItemsPerThread, class Default>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_warp_striped(unsigned int  flat_id,
+                                    InputIterator block_input,
+                                    T (&items)[ItemsPerThread],
+                                    unsigned int valid,
+                                    Default      out_of_bounds)
+{
+    if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size())
+    {
+        block_load_direct_warp_striped<arch::wavefront::min_size()>(flat_id,
+                                                                    block_input,
+                                                                    items,
+                                                                    valid,
+                                                                    out_of_bounds);
+    }
+    else
+    {
+        if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64)
+        {
+            block_load_direct_warp_striped<ROCPRIM_WARP_SIZE_64>(flat_id,
+                                                                 block_input,
+                                                                 items,
+                                                                 valid,
+                                                                 out_of_bounds);
+        }
+        else
+        {
+            block_load_direct_warp_striped<ROCPRIM_WARP_SIZE_32>(flat_id,
+                                                                 block_input,
+                                                                 items,
+                                                                 valid,
+                                                                 out_of_bounds);
+        }
+    }
+}
+
+/// \brief Loads data from continuous memory into a blocked arrangement of items
+/// across the thread block.
+///
+/// \ingroup blockmodule_cast_load_functions
+/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to load a range of
+/// \p ItemsPerThread into \p items.
+///
+/// The following conditions will prevent casting and switch to default
+/// block_store_direct_blocked:
+/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V)
+/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+/// int4, etc.)
+///
+/// \tparam V [optional] the type it will be casted to
+/// \tparam cache_load_modifier [optional] the type thread load used
+/// \tparam VirtualWaveSize [optional] the number of threads in a warp
+/// \tparam T [inferred] the output data type
+/// \tparam U [inferred] the input data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input the input iterator from the thread block to load from
+/// \param items array that data is loaded to
+template<class V                      = rocprim::uint128_t,
+         cache_load_modifier LoadType = load_default,
+         unsigned int        VirtualWaveSize,
          class T,
          class U,
          unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
 auto block_load_direct_blocked_cast(unsigned int flat_id,
                                     T*           block_input,
-                                    U (&items)[ItemsPerThread]) ->
-    typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value
-                            && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type
+                                    U (&items)[ItemsPerThread])
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value
+                               && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 {
     static_assert(detail::is_power_of_two(VirtualWaveSize)
                       && VirtualWaveSize <= arch::wavefront::max_size(),
@@ -542,9 +709,10 @@ auto block_load_direct_blocked_cast(unsigned int flat_id,
     }
 }
 
-template<class V                             = rocprim::uint128_t,
-         cache_load_modifier LoadType        = load_default,
-         unsigned int        VirtualWaveSize = arch::wavefront::min_size(),
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+template<class V                      = rocprim::uint128_t,
+         cache_load_modifier LoadType = load_default,
+         unsigned int        VirtualWaveSize,
          class T,
          class U,
          unsigned int ItemsPerThread>
@@ -557,6 +725,64 @@ auto block_load_direct_blocked_cast(unsigned int flat_id,
 {
     block_load_direct_blocked(flat_id, block_input, items);
 }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/// \brief Loads data from continuous memory into a blocked arrangement of items
+/// across the thread block, using hardware warp size.
+///
+/// \ingroup blockmodule_cast_load_functions
+/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to load a range of
+/// \p ItemsPerThread into \p items.
+///
+/// The following conditions will prevent casting and switch to default
+/// block_store_direct_blocked:
+/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V)
+/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+/// int4, etc.)
+///
+/// \tparam V [optional] the type it will be casted to
+/// \tparam cache_load_modifier [optional] the type thread load used
+/// \tparam T [inferred] the output data type
+/// \tparam U [inferred] the input data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input the input iterator from the thread block to load from
+/// \param items array that data is loaded to
+template<class V                      = rocprim::uint128_t,
+         cache_load_modifier LoadType = load_default,
+         class T,
+         class U,
+         unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto block_load_direct_blocked_cast(unsigned int flat_id,
+                                    T*           block_input,
+                                    U (&items)[ItemsPerThread])
+{
+    if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size())
+    {
+        block_load_direct_blocked_cast<V, LoadType, arch::wavefront::min_size()>(flat_id,
+                                                                                 block_input,
+                                                                                 items);
+    }
+    else
+    {
+        if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64)
+        {
+            block_load_direct_blocked_cast<V, LoadType, ROCPRIM_WARP_SIZE_64>(flat_id,
+                                                                              block_input,
+                                                                              items);
+        }
+        else
+        {
+            block_load_direct_blocked_cast<V, LoadType, ROCPRIM_WARP_SIZE_32>(flat_id,
+                                                                              block_input,
+                                                                              items);
+        }
+    }
+}
 
 END_ROCPRIM_NAMESPACE
 
diff --git a/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp b/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp
index aad00ab5f4d..060447eb28e 100644
--- a/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp
@@ -44,7 +44,7 @@ BEGIN_ROCPRIM_NAMESPACE
 /// \p ItemsPerThread \p items to the thread block.
 ///
 /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -83,7 +83,7 @@ void block_store_direct_blocked(unsigned int flat_id,
 /// \p ItemsPerThread \p items to the thread block.
 ///
 /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -132,7 +132,7 @@ void block_store_direct_blocked(unsigned int flat_id,
 /// block_load_direct_blocked:
 /// * \p ItemsPerThread is odd.
 /// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
-/// int4, etc.
+/// int4, etc.)
 ///
 /// \tparam T [inferred] the output data type
 /// \tparam U [inferred] the input data type
@@ -197,7 +197,7 @@ block_store_direct_blocked_vectorized(unsigned int flat_id,
 ///
 /// \tparam BlockSize the number of threads in a block
 /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -237,7 +237,7 @@ void block_store_direct_striped(unsigned int flat_id,
 ///
 /// \tparam BlockSize the number of threads in a block
 /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -283,7 +283,6 @@ void block_store_direct_striped(unsigned int flat_id,
 /// \p ItemsPerThread \p items to the thread block.
 ///
 /// * The number of threads in the block must be a multiple of \p VirtualWaveSize.
-/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value.
 /// * \p VirtualWaveSize must be a power of two and equal or less than the size of
 ///   hardware warp.
 /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower
@@ -291,7 +290,7 @@ void block_store_direct_striped(unsigned int flat_id,
 ///
 /// \tparam VirtualWaveSize [optional] the number of threads in a warp
 /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -299,10 +298,7 @@ void block_store_direct_striped(unsigned int flat_id,
 /// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
 /// \param block_output the input iterator from the thread block to store to
 /// \param items array that data is stored to thread block
-template<unsigned int VirtualWaveSize = arch::wavefront::min_size(),
-         class OutputIterator,
-         class T,
-         unsigned int ItemsPerThread>
+template<unsigned int VirtualWaveSize, class OutputIterator, class T, unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
 void block_store_direct_warp_striped(unsigned int   flat_id,
                                      OutputIterator block_output,
@@ -328,6 +324,46 @@ void block_store_direct_warp_striped(unsigned int   flat_id,
     }
 }
 
+/// \brief Stores a warp-striped arrangement of items from across the thread block
+/// into a blocked arrangement on continuous memory, using the hardware warp size.
+///
+/// \ingroup blockmodule_warp_store_functions
+/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to store a range of
+/// \p ItemsPerThread \p items to the thread block.
+///
+/// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T [inferred] the data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output the input iterator from the thread block to store to
+/// \param items array that data is stored to thread block
+template<class OutputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_warp_striped(unsigned int   flat_id,
+                                     OutputIterator block_output,
+                                     T (&items)[ItemsPerThread])
+{
+    if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size())
+    {
+        block_store_direct_warp_striped<arch::wavefront::min_size()>(flat_id, block_output, items);
+    }
+    else
+    {
+        if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64)
+        {
+            block_store_direct_warp_striped<ROCPRIM_WARP_SIZE_64>(flat_id, block_output, items);
+        }
+        else
+        {
+            block_store_direct_warp_striped<ROCPRIM_WARP_SIZE_32>(flat_id, block_output, items);
+        }
+    }
+}
+
 /// \brief Stores a warp-striped arrangement of items from across the thread block
 /// into a blocked arrangement on continuous memory, which is guarded by range \p valid.
 ///
@@ -337,7 +373,6 @@ void block_store_direct_warp_striped(unsigned int   flat_id,
 /// \p ItemsPerThread \p items to the thread block.
 ///
 /// * The number of threads in the block must be a multiple of \p VirtualWaveSize.
-/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value.
 /// * \p VirtualWaveSize must be a power of two and equal or less than the size of
 ///   hardware warp.
 /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower
@@ -345,7 +380,7 @@ void block_store_direct_warp_striped(unsigned int   flat_id,
 ///
 /// \tparam VirtualWaveSize [optional] the number of threads in a warp
 /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
-/// pointer
+/// pointer)
 /// \tparam T [inferred] the data type
 /// \tparam ItemsPerThread [inferred] the number of items to be processed by
 /// each thread
@@ -354,10 +389,7 @@ void block_store_direct_warp_striped(unsigned int   flat_id,
 /// \param block_output the input iterator from the thread block to store to
 /// \param items array that data is stored to thread block
 /// \param valid maximum range of valid numbers to store
-template<unsigned int VirtualWaveSize = arch::wavefront::min_size(),
-         class OutputIterator,
-         class T,
-         unsigned int ItemsPerThread>
+template<unsigned int VirtualWaveSize, class OutputIterator, class T, unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
 void block_store_direct_warp_striped(unsigned int   flat_id,
                                      OutputIterator block_output,
@@ -390,17 +422,95 @@ void block_store_direct_warp_striped(unsigned int   flat_id,
     }
 }
 
-template<class V                      = rocprim::uint128_t,
-         unsigned int VirtualWaveSize = arch::wavefront::min_size(),
+/// \brief Stores a warp-striped arrangement of items from across the thread block
+/// into a blocked arrangement on continuous memory, which is guarded by range \p valid,
+/// using the hardware warp size.
+///
+/// \ingroup blockmodule_warp_store_functions
+/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to store a range of
+/// \p ItemsPerThread \p items to the thread block.
+///
+/// \tparam OutputIterator [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T [inferred] the data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output the input iterator from the thread block to store to
+/// \param items array that data is stored to thread block
+/// \param valid maximum range of valid numbers to store
+template<class OutputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_warp_striped(unsigned int   flat_id,
+                                     OutputIterator block_output,
+                                     T (&items)[ItemsPerThread],
+                                     unsigned int valid)
+{
+    if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size())
+    {
+        block_store_direct_warp_striped<arch::wavefront::min_size()>(flat_id,
+                                                                     block_output,
+                                                                     items,
+                                                                     valid);
+    }
+    else
+    {
+        if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64)
+        {
+            block_store_direct_warp_striped<ROCPRIM_WARP_SIZE_64>(flat_id,
+                                                                  block_output,
+                                                                  items,
+                                                                  valid);
+        }
+        else
+        {
+            block_store_direct_warp_striped<ROCPRIM_WARP_SIZE_32>(flat_id,
+                                                                  block_output,
+                                                                  items,
+                                                                  valid);
+        }
+    }
+}
+
+/// \brief Stores a blocked arrangement of items from across the thread block
+/// into a blocked arrangement on continuous memory.
+///
+/// \ingroup blockmodule_cast_store_functions
+/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to store a range of
+/// \p ItemsPerThread \p items to the thread block.
+///
+/// The following conditions will prevent casting and switch to default
+/// block_store_direct_blocked:
+/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V)
+/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+/// int4, etc.)
+///
+/// \tparam V [optional] the type it will be casted to
+/// \tparam VirtualWaveSize [optional] the number of threads in a warp
+/// \tparam T [inferred] the output data type
+/// \tparam U [inferred] the input data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output the input iterator from the thread block to load from
+/// \param items array that data is loaded to
+template<class V = rocprim::uint128_t,
+         unsigned int VirtualWaveSize,
          class T,
          class U,
          unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
 auto block_store_direct_blocked_cast(unsigned int flat_id,
                                      T*           block_output,
-                                     U (&items)[ItemsPerThread]) ->
-    typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value
-                            && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type
+                                     U (&items)[ItemsPerThread])
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value
+                               && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 {
     static_assert(std::is_convertible<U, T>::value,
                   "The type U must be such that it can be implicitly converted to T.");
@@ -416,8 +526,9 @@ auto block_store_direct_blocked_cast(unsigned int flat_id,
     }
 }
 
-template<class V                      = rocprim::uint128_t,
-         unsigned int VirtualWaveSize = arch::wavefront::min_size(),
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+template<class V = rocprim::uint128_t,
+         unsigned int VirtualWaveSize,
          class T,
          class U,
          unsigned int ItemsPerThread>
@@ -430,6 +541,56 @@ auto block_store_direct_blocked_cast(unsigned int flat_id,
 {
     block_store_direct_blocked(flat_id, block_output, items);
 }
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/// \brief Stores a blocked arrangement of items from across the thread block
+/// into a blocked arrangement on continuous memory, using the hardware warp size.
+///
+/// \ingroup blockmodule_cast_store_functions
+/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to store a range of
+/// \p ItemsPerThread \p items to the thread block.
+///
+/// The following conditions will prevent casting and switch to default
+/// block_load_direct_blocked:
+/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V)
+/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+/// int4, etc.)
+///
+/// \tparam V [optional] the type it will be casted to
+/// \tparam T [inferred] the output data type
+/// \tparam U [inferred] the input data type
+/// \tparam ItemsPerThread [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output the input iterator from the thread block to load from
+/// \param items array that data is loaded to
+template<class V = rocprim::uint128_t, class T, class U, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto block_store_direct_blocked_cast(unsigned int flat_id,
+                                     T*           block_output,
+                                     U (&items)[ItemsPerThread])
+{
+
+    if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size())
+    {
+        block_store_direct_blocked_cast<V, arch::wavefront::min_size()>(flat_id,
+                                                                        block_output,
+                                                                        items);
+    }
+    else
+    {
+        if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64)
+        {
+            block_store_direct_blocked_cast<V, ROCPRIM_WARP_SIZE_64>(flat_id, block_output, items);
+        }
+        else
+        {
+            block_store_direct_blocked_cast<V, ROCPRIM_WARP_SIZE_32>(flat_id, block_output, items);
+        }
+    }
+}
 
 END_ROCPRIM_NAMESPACE
 
diff --git a/projects/rocprim/rocprim/include/rocprim/config.hpp b/projects/rocprim/rocprim/include/rocprim/config.hpp
index edb947e879e..b5e4ba04359 100644
--- a/projects/rocprim/rocprim/include/rocprim/config.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/config.hpp
@@ -165,11 +165,11 @@
     #define ROCPRIM_DETAIL_HAS_DPP_WF 1
 #endif
 
-#ifndef ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS
+#if !defined(ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS) && !defined(ROCPRIM_TARGET_SPIRV)
     #define ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS 1
 #endif
 
-#ifndef ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS
+#if !defined(ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS) && !defined(ROCPRIM_TARGET_SPIRV)
     #define ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS 1
 #endif
 
diff --git a/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp b/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp
index 25cd3b31255..40a30f8417d 100644
--- a/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp
@@ -131,7 +131,11 @@ void serial_merge(KeyType*                keys_shared,
         key_b = keys_shared[idx_b];
     }
 
+#if defined(__SPIRV__) // TEMP FIX
+    ROCPRIM_NO_UNROLL
+#else
     ROCPRIM_UNROLL
+#endif
     for(unsigned int i = 0; i < ItemsPerThread; ++i)
     {
         // If we don't have any in b, we always take from a. Then, if we don't
diff --git a/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp b/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp
index 84198f6cbc8..3140da4e11a 100644
--- a/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp
@@ -198,7 +198,7 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void vectorized_copy_bytes(const void
     using vector_type                      = uint4;
     constexpr uint32_t ints_in_vector_type = sizeof(uint4) / sizeof(uint32_t);
 
-    constexpr auto warp_size = ::rocprim::arch::wavefront::min_size();
+    const auto     warp_size = ::rocprim::arch::wavefront::size();
     const auto     rank      = ::rocprim::detail::block_thread_id<0>() % warp_size;
 
     const uint8_t* src = reinterpret_cast<const uint8_t*>(input_buffer) + offset;
@@ -230,9 +230,6 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void vectorized_copy_bytes(const void
         in_ptr += warp_size;
     }
 
-    // This can be outside the while block since 'warp_size % ints_in_vector_type' always is '0'
-    static_assert(warp_size % ints_in_vector_type == 0, "Warp size is not a multiple of 4");
-
     in_ptr                          = aligned.in_begin + rank * sizeof(vector_type);
     const uint32_t  in_offset       = (reinterpret_cast<size_t>(in_ptr) % ints_in_vector_type);
     vector_type*    aligned_out_ptr = aligned.out_begin + rank;
@@ -315,7 +312,7 @@ template<bool IsMemCpy,
 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void
     copy_items(InputIt input_buffer, OutputIt output_buffer, Offset num_items, Offset offset = 0)
 {
-    constexpr auto warp_size = ::rocprim::arch::wavefront::min_size();
+    const auto warp_size = ::rocprim::arch::wavefront::size();
     output_buffer += offset;
     input_buffer += offset;
     for(Offset i = threadIdx.x % warp_size; i < num_items; i += warp_size)
@@ -614,7 +611,7 @@ struct batch_memcpy_impl
         {
             const uint32_t warp_id = rocprim::warp_id();
             const uint32_t warps_per_block
-                = rocprim::flat_block_size() / ::rocprim::arch::wavefront::min_size();
+                = rocprim::flat_block_size() / ::rocprim::arch::wavefront::size();
 
             for(buffer_offset_type buffer_offset = warp_id; buffer_offset < num_wlev_buffers;
                 buffer_offset += warps_per_block)
diff --git a/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp b/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp
index 2cca72df5ff..3475a82705e 100644
--- a/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp
@@ -226,7 +226,7 @@ struct is_sample_vectorizable
 
 template<unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Channels, class Sample>
 ROCPRIM_DEVICE ROCPRIM_INLINE
-    typename std::enable_if<is_sample_vectorizable<ItemsPerThread, Channels, Sample>::value>::type
+typename std::enable_if<is_sample_vectorizable<ItemsPerThread, Channels, Sample>::value>::type
     load_samples(unsigned int flat_id,
                  Sample*      samples,
                  sample_vector<Sample, Channels> (&values)[ItemsPerThread])
@@ -489,6 +489,7 @@ ROCPRIM_DEVICE ROCPRIM_INLINE void
         load_samples<BlockSize>(flat_id, samples, values, valid_count);
     }
 
+    ROCPRIM_UNROLL
     for(unsigned int i = 0; i < ItemsPerThread; i++)
     {
         for(unsigned int channel = 0; channel < ActiveChannels; channel++)
diff --git a/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp b/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp
index 639a6b55f88..3c04bf60118 100644
--- a/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp
@@ -155,14 +155,16 @@ struct discontinuity_helper
 
 /// Custom warp_exchange class with extra check in scatter_to_striped for out-of-bound accesses.
 template<class T,
-         unsigned int ItemsPerThread,
-         unsigned int WarpSize = ::rocprim::arch::wavefront::min_size()>
+         unsigned int                       ItemsPerThread,
+         unsigned int                       WarpSize = ::rocprim::arch::wavefront::min_size(),
+         ::rocprim::arch::wavefront::target TargetWaveSize
+         = ::rocprim::arch::wavefront::get_target()>
 class custom_warp_exchange
 {
     static_assert(::rocprim::detail::is_power_of_two(WarpSize),
                   "Logical warp size must be a power of two.");
     ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(
-        WarpSize <= ::rocprim::arch::wavefront::min_size(),
+        WarpSize <= ::rocprim::arch::wavefront::size_from_target<TargetWaveSize>(),
         "Logical warp size cannot be larger than physical warp size.");
 
     static constexpr unsigned int warp_items = WarpSize * ItemsPerThread;
@@ -414,10 +416,13 @@ template<typename InputType,
          typename OffsetType,
          typename CountType,
          typename OffsetCountPairType,
-         unsigned int         BlockSize,
-         unsigned int         ItemsPerThread,
-         block_load_method    load_input_method,
-         block_scan_algorithm scan_algorithm>
+         unsigned int                       BlockSize,
+         unsigned int                       ItemsPerThread,
+         block_load_method                  load_input_method,
+         block_scan_algorithm               scan_algorithm,
+         ::rocprim::arch::wavefront::target TargetWaveSize
+         = ::rocprim::arch::wavefront::get_target(),
+         typename Enabled = void>
 class block_helper
 {
 private:
@@ -432,8 +437,8 @@ class block_helper
         = run_length_encode::discontinuity_helper<InputType, equal_op, BlockSize>;
 
     // Warp size.
-    static constexpr unsigned int warp_size
-        = detail::get_min_warp_size(BlockSize, ::rocprim::arch::wavefront::min_size());
+    static constexpr unsigned int warp_size = detail::get_min_warp_size(
+        BlockSize, ::rocprim::arch::wavefront::size_from_target<TargetWaveSize>());
     // Number of warps in block.
     static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
 
@@ -446,9 +451,9 @@ class block_helper
     // warp_exchange primitives that will be used to perform warp-level scatter_to_striped
     // on offsets and counts.
     using warp_exchange_offsets_type
-        = custom_warp_exchange<OffsetType, ItemsPerThread, /*logical_*/ warp_size>;
+        = custom_warp_exchange<OffsetType, ItemsPerThread, /*logical_*/ warp_size, TargetWaveSize>;
     using warp_exchange_counts_type
-        = custom_warp_exchange<CountType, ItemsPerThread, /*logical_*/ warp_size>;
+        = custom_warp_exchange<CountType, ItemsPerThread, /*logical_*/ warp_size, TargetWaveSize>;
     // Helper class for scattering offsets and counts.
     using warp_scatter_type = run_length_encode::scatter_helper<warp_exchange_offsets_type,
                                                                 warp_exchange_counts_type,
@@ -496,10 +501,8 @@ class block_helper
                                      const unsigned int    block_id,
                                      const std::size_t     grid_size,
                                      const std::size_t     size,
-                                     storage_type_&        storage_)
+                                     storage_type&        storage)
     {
-        storage_type& storage = storage_.get();
-
         static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
         const std::size_t             block_offset    = block_id * items_per_block;
 
@@ -758,6 +761,62 @@ class block_helper
     }
 };
 
+template<typename InputType,
+         typename OffsetType,
+         typename CountType,
+         typename OffsetCountPairType,
+         unsigned int         BlockSize,
+         unsigned int         ItemsPerThread,
+         block_load_method    load_input_method,
+         block_scan_algorithm scan_algorithm>
+class block_helper<InputType,
+                   OffsetType,
+                   CountType,
+                   OffsetCountPairType,
+                   BlockSize,
+                   ItemsPerThread,
+                   load_input_method,
+                   scan_algorithm,
+                   ::rocprim::arch::wavefront::target::dynamic>
+{
+private:
+    using block_helper_wave32 = block_helper<InputType,
+                                             OffsetType,
+                                             CountType,
+                                             OffsetCountPairType,
+                                             BlockSize,
+                                             ItemsPerThread,
+                                             load_input_method,
+                                             scan_algorithm,
+                                             ::rocprim::arch::wavefront::target::size32>;
+    using block_helper_wave64 = block_helper<InputType,
+                                             OffsetType,
+                                             CountType,
+                                             OffsetCountPairType,
+                                             BlockSize,
+                                             ItemsPerThread,
+                                             load_input_method,
+                                             scan_algorithm,
+                                             ::rocprim::arch::wavefront::target::size64>;
+
+    using dispatch = detail::dispatch_wave_size<block_helper_wave32, block_helper_wave64>;
+
+public:
+    using storage_type = typename dispatch::storage_type;
+
+    ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH
+    using storage_type_ = detail::raw_storage<storage_type>;
+    ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP
+
+    template<typename... Args>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    auto process_block(Args&&... args)
+    {
+        return dispatch{}([](auto impl, auto&&... args) { return impl.process_block(args...); },
+                          args...);
+    }
+};
+
 template<typename Config,
          typename OffsetCountPairType,
          typename InputIterator,
@@ -815,7 +874,10 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto
                                          load_input_method,
                                          scan_algorithm>;
 
-    ROCPRIM_SHARED_MEMORY typename block_processor::storage_type_ storage;
+    ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH ROCPRIM_SHARED_MEMORY
+    typename detail::raw_storage<typename block_processor::storage_type>
+        storage;
+    ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP
 
     const size_t block_id = flat_block_id<block_size, 1, 1>();
 
@@ -834,7 +896,7 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto
                                         block_id,
                                         grid_size,
                                         size,
-                                        storage);
+                                        storage.get());
     }
     else if(valid_in_last_block > 0)
     {
@@ -845,7 +907,7 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto
                                                                     block_id,
                                                                     grid_size,
                                                                     size,
-                                                                    storage);
+                                                                    storage.get());
         // First thread of last block sets the total number of non-trivial runs found and updates
         // the counts with the last run's length if necessary.
         if(threadIdx.x == 0)
diff --git a/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp b/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp
index c1d08a066df..1813596ffc0 100644
--- a/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp
+++ b/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp
@@ -885,20 +885,24 @@ inline hipError_t merge_inplace(void*             temporary_storage,
                                                                    block_block_size,
                                                                    impl::block_merge_kernel,
                                                                    stream));
+
+    const int min_grid_size = rocprim::min(
+        block_merge_grid_size,
+        static_cast<int>(rocprim::detail::ceiling_div(left_size + right_size, block_block_size)));
+
     if(debug_synchronous)
     {
         std::cout << "block_merge_kernel\n"
-                  << "  grid_size     : " << block_merge_grid_size << "\n"
+                  << "  grid_size     : " << min_grid_size << "\n"
                   << "  block_size    : " << block_block_size << std::endl;
     }
 
     // each of the sub merging problem can be solved within a block
-    impl::block_merge_kernel<<<block_merge_grid_size, block_block_size, 0, stream>>>(
-        data,
-        left_size + right_size,
-        compare_function,
-        work_storage,
-        scratch_storage);
+    impl::block_merge_kernel<<<min_grid_size, block_block_size, 0, stream>>>(data,
+                                                                             left_size + right_size,
+                                                                             compare_function,
+                                                                             work_storage,
+                                                                             scratch_storage);
     ROCPRIM_RETURN_ON_ERROR(hipGetLastError());
     if(debug_synchronous)
     {
diff --git a/projects/rocprim/scripts/autotune-search/main.py b/projects/rocprim/scripts/autotune-search/main.py
index f9e7fb25621..1d7e3f9a276 100755
--- a/projects/rocprim/scripts/autotune-search/main.py
+++ b/projects/rocprim/scripts/autotune-search/main.py
@@ -278,15 +278,12 @@ def sample(xs: List[float]) -> Union[float, int]:
                 bench = subprocess.call(
                     [
                         os.path.join(build_dir, 'benchmark', build_target),
-                        '--name_format',
-                        'json',
                         '--seed',
                         'random',  # Random is better... I think? Otherwise we might overfit.
                         '--size',
                         f'{size}',
                         '--trials',
                         f'{trials}',
-                        '--benchmark_out_format=json',
                         f'--benchmark_out={result_filename}',
                     ],
                     cwd=result_dir,
diff --git a/projects/rocprim/test/rocprim/test_intrinsics.cpp b/projects/rocprim/test/rocprim/test_intrinsics.cpp
index 9b4fddede63..ac54a3998b8 100644
--- a/projects/rocprim/test/rocprim/test_intrinsics.cpp
+++ b/projects/rocprim/test/rocprim/test_intrinsics.cpp
@@ -544,7 +544,7 @@ void masked_bit_count_kernel(unsigned int*             out,
                              const max_lane_mask_type  active_lanes)
 {
     const unsigned int out_index = blockIdx.x * blockDim.x + threadIdx.x;
-    const unsigned int in_index  = out_index / rocprim::arch::wavefront::min_size();
+    const unsigned int in_index  = out_index / rocprim::arch::wavefront::size();
 
     const auto   value  = static_cast<rocprim::lane_mask_type>(in[in_index]);
     unsigned int result = test_type_helper<unsigned int>::uninitialized();
@@ -655,7 +655,7 @@ void warp_any_all_kernel(unsigned int*             out,
 {
     const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int predicate
-        = (in[index / rocprim::arch::wavefront::min_size()] >> rocprim::lane_id()) & 1;
+        = (in[index / rocprim::arch::wavefront::size()] >> rocprim::lane_id()) & 1;
 
     unsigned int result = test_type_helper<unsigned int>::uninitialized();
     if(is_lane_active(active_lanes, rocprim::lane_id()))
@@ -1105,7 +1105,7 @@ void group_elect_kernel(max_lane_mask_type* output,
     const unsigned int input_index = blockIdx.x * blockDim.x + threadIdx.x;
 
     const unsigned int output_index
-        = blockIdx.x * warps_per_block + threadIdx.x / ::rocprim::arch::wavefront::min_size();
+        = blockIdx.x * warps_per_block + threadIdx.x / ::rocprim::arch::wavefront::size();
 
     if(rocprim::group_elect(input[input_index]))
     {
diff --git a/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp b/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp
index eafd7327c12..72e8bb8c7e1 100644
--- a/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp
+++ b/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp
@@ -78,7 +78,6 @@ struct custom_non_default_type
 };
 
 // Custom type used in tests
-// Loops are prevented from being unrolled due to a compiler bug in ROCm 5.2 for device code
 template<class T, size_t N>
 struct custom_test_array_type
 {
@@ -90,17 +89,14 @@ struct custom_test_array_type
     ROCPRIM_HOST_DEVICE inline
         custom_test_array_type()
     {
-#pragma unroll 1
         for(size_t i = 0; i < N; i++)
         {
             values[i] = T(i + 1);
         }
     }
 
-    ROCPRIM_HOST_DEVICE inline
-        custom_test_array_type(T v)
+    ROCPRIM_HOST_DEVICE inline custom_test_array_type(T v)
     {
-#pragma unroll 1
         for(size_t i = 0; i < N; i++)
         {
             values[i] = v;
@@ -111,20 +107,18 @@ struct custom_test_array_type
     ROCPRIM_HOST_DEVICE inline
         custom_test_array_type(const custom_test_array_type<U, N>& other)
     {
-#pragma unroll 1
         for(size_t i = 0; i < N; i++)
         {
             values[i] = other.values[i];
         }
     }
 
-    ROCPRIM_HOST_DEVICE inline
-        ~custom_test_array_type() {}
+    ROCPRIM_HOST_DEVICE inline ~custom_test_array_type() {}
 
-    ROCPRIM_HOST_DEVICE inline
-        custom_test_array_type& operator=(const custom_test_array_type& other)
+    ROCPRIM_HOST_DEVICE
+    inline custom_test_array_type&
+        operator=(const custom_test_array_type& other)
     {
-#pragma unroll 1
         for(size_t i = 0; i < N; i++)
         {
             values[i] = other.values[i];
@@ -132,11 +126,11 @@ struct custom_test_array_type
         return *this;
     }
 
-    ROCPRIM_HOST_DEVICE inline
-        custom_test_array_type operator+(const custom_test_array_type& other) const
+    ROCPRIM_HOST_DEVICE
+    inline custom_test_array_type
+        operator+(const custom_test_array_type& other) const
     {
         custom_test_array_type result;
-#pragma unroll 1
         for(size_t i = 0; i < N; i++)
         {
             result.values[i] = values[i] + other.values[i];
@@ -148,7 +142,6 @@ struct custom_test_array_type
         custom_test_array_type operator-(const custom_test_array_type& other) const
     {
         custom_test_array_type result;
-#pragma unroll 1
         for(size_t i = 0; i < N; i++)
         {
             result.values[i] = values[i] - other.values[i];
@@ -159,7 +152,6 @@ struct custom_test_array_type
     ROCPRIM_HOST_DEVICE inline
         bool operator<(const custom_test_array_type& other) const
     {
-#pragma unroll 1
         for(unsigned int i = 0; i < N; i++)
         {
             if(values[i] < other.values[i])
@@ -174,10 +166,10 @@ struct custom_test_array_type
         return false;
     }
 
-    ROCPRIM_HOST_DEVICE inline
-        bool operator>(const custom_test_array_type& other) const
+    ROCPRIM_HOST_DEVICE
+    inline bool
+        operator>(const custom_test_array_type& other) const
     {
-#pragma unroll 1
         for(unsigned int i = 0; i < N; i++)
         {
             if(values[i] > other.values[i])
@@ -192,10 +184,10 @@ struct custom_test_array_type
         return false;
     }
 
-    ROCPRIM_HOST_DEVICE inline
-        bool operator==(const custom_test_array_type& other) const
+    ROCPRIM_HOST_DEVICE
+    inline bool
+        operator==(const custom_test_array_type& other) const
     {
-#pragma unroll 1
         for(size_t i = 0; i < N; i++)
         {
             if(values[i] != other.values[i])
@@ -206,8 +198,9 @@ struct custom_test_array_type
         return true;
     }
 
-    ROCPRIM_HOST_DEVICE inline
-        bool operator!=(const custom_test_array_type& other) const
+    ROCPRIM_HOST_DEVICE
+    inline bool
+        operator!=(const custom_test_array_type& other) const
     {
         return !(*this == other);
     }