diff --git a/projects/rocprim/.gitlab-ci.yml b/projects/rocprim/.gitlab-ci.yml index 95b946ae49e..ec650059a29 100644 --- a/projects/rocprim/.gitlab-ci.yml +++ b/projects/rocprim/.gitlab-ci.yml @@ -610,6 +610,11 @@ benchmark: --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}" --benchmark_filter_regex "${BENCHMARK_ALGORITHM_REGEX}" --seed "${BENCHMARK_SEED}" + - python3 + .gitlab/report_noise.py + --benchmark_json_dir "${BENCHMARK_RESULT_DIR}" + --noise_threshold_percentage 1.0 + --accept_high_noise artifacts: paths: - ${BENCHMARK_RESULT_DIR} @@ -698,6 +703,11 @@ autotune:execute-tuning: --size="${AUTOTUNE_SIZE}" --trials="${AUTOTUNE_TRIALS}" --seed=82589933 + - python3 + .gitlab/report_noise.py + --benchmark_json_dir "${AUTOTUNE_RESULT_DIR}" + --noise_threshold_percentage 1.0 + --accept_high_noise autotune:generate-config: image: python:3.10.5-buster diff --git a/projects/rocprim/.gitlab/report_noise.py b/projects/rocprim/.gitlab/report_noise.py new file mode 100644 index 00000000000..974c2b128b7 --- /dev/null +++ b/projects/rocprim/.gitlab/report_noise.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + +import argparse +import json +import os +import re +import stat +import statistics +import sys + + +class colors: + OK = "\033[92m" + FAIL = "\033[91m" + END_COLOR = "\033[0m" + + +def print_results(results): + # Store the length of the longest value in a column + longest = { + "name": max(len(result["name"]) for result in results), + "noisy_permutations": max( + len(result["noisy_permutations"]) for result in results + ), + "mean": max(len(result["mean"]) for result in results), + "median": max(len(result["median"]) for result in results), + "max": max(len(result["max"]) for result in results), + "batch": max(len(result["batch"]) for result in results), + "warmup": max(len(result["warmup"]) for result in results), + "bytes": max(len(result["bytes"]) for result in results), + } + + # The name of a column can be longer than its values + longest = {key: max(value, len(key)) for key, value in longest.items()} + + printed = "name".ljust(longest["name"] + 1) + printed += "noisy permutations".ljust(longest["noisy_permutations"] + 1) + printed += "mean".ljust(longest["mean"] + 1) + printed += "median".ljust(longest["median"] + 1) + printed += "max".ljust(longest["max"] + 1) + printed += "batch".ljust(longest["batch"] + 1) + printed += "warmup".ljust(longest["warmup"] + 1) + printed += "bytes".ljust(longest["bytes"] + 1) + printed += "seed" + print(printed) + + for result in results: + printed = result["name"].ljust(longest["name"]) + + printed += " " + printed += colors.FAIL if result["noisy"] else colors.OK + printed += ( + f'{result["noisy_permutations"].ljust(longest["noisy_permutations"])}' + ) + printed += colors.END_COLOR + + printed += " " + printed += colors.FAIL if result["bad_mean"] else colors.OK + printed += result["mean"].ljust(longest["mean"]) + printed += colors.END_COLOR + + printed += " " + printed += colors.FAIL if result["bad_median"] else colors.OK + printed += result["median"].ljust(longest["median"]) + printed += colors.END_COLOR + + printed += " " + printed += colors.FAIL if result["bad_max"] else colors.OK + printed += result["max"].ljust(longest["max"]) + printed += colors.END_COLOR + + printed += " " + printed += colors.FAIL if result["bad_batch"] else colors.OK + printed += result["batch"].ljust(longest["batch"]) + printed += colors.END_COLOR + + printed += " " + printed += colors.FAIL if result["bad_warmup"] else colors.OK + printed += result["warmup"].ljust(longest["warmup"]) + printed += colors.END_COLOR + + printed += " " + printed += colors.FAIL if result["bad_bytes"] else colors.OK + printed += result["bytes"].ljust(longest["bytes"]) + printed += colors.END_COLOR + + printed += " " + printed += colors.FAIL if result["seed"] == "random" else colors.OK + printed += result["seed"] + printed += colors.END_COLOR + + print(printed) + + +def get_results(benchmarks, threshold): + def get_humanized_bytes(size): + for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]: + if size < 1024.0 or unit == "PiB": + break + size /= 1024.0 + return f"{size:.1f} {unit}" + + success = True + + results = [] + + for benchmark in benchmarks: + data = benchmark["data"] + + name = benchmark["name"] + + permutations = data["benchmarks"] + + cvs = [permutation["cv"] for permutation in permutations] + + # The cv (coefficient of variation) is a standard way of quantifying noise + noises = sum(cv * 100 > threshold for cv in cvs) + noisy = noises > 0 + + if noisy: + success = False + + context = data["context"] + + noisy_permutations = f"{noises}/{len(permutations)}" + + mean = statistics.mean(cvs) + median = statistics.median(cvs) + max_ = max(cvs) + + batch = context["batch_iterations"] + warmup = context["warmup_iterations"] + + bytes_ = int(context["size"]) + seed = context["seed"] + + results.append( + { + "name": name, + "noisy": noisy, + "noisy_permutations": noisy_permutations, + "bad_mean": mean * 100 > threshold, + "mean": f"{mean:.1%}", + "bad_median": median * 100 > threshold, + "median": f"{median:.1%}", + "bad_max": max_ * 100 > threshold, + "max": f"{max_:.1%}", + "bad_batch": int(batch) < 10, + "batch": batch, + "bad_warmup": int(warmup) < 5, + "warmup": warmup, + "bad_bytes": 0 < bytes_ < 128 * 1024 * 1024, # 128 MiB + "bytes": get_humanized_bytes(int(context["size"])), + "seed": seed, + } + ) + + return results, success + + +def load_benchmarks(benchmark_json_dir): + def is_benchmark_json(filename): + if not re.match(r".*\.json$", filename): + return False + path = os.path.join(benchmark_json_dir, filename) + st_mode = os.stat(path).st_mode + + # we are not interested in permissions, just whether it is a regular file (S_IFREG) + return st_mode & stat.S_IFREG + + benchmark_names = [ + name for name in os.listdir(benchmark_json_dir) if is_benchmark_json(name) + ] + + success = True + benchmarks = [] + for benchmark_name in benchmark_names: + with open(os.path.join(benchmark_json_dir, benchmark_name)) as f: + try: + benchmarks.append({"name": benchmark_name, "data": json.load(f)}) + except json.JSONDecodeError as e: + print( + f"{colors.FAIL}Failed to load {benchmark_name}{colors.END_COLOR}: {e}\n", + file=sys.stderr, + ) + success = False + + return benchmarks, success + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--noise_threshold_percentage", + help="The noise threshold percentage, past which benchmark permutations are considered to be too noisy", + required=True, + type=float, + ) + parser.add_argument( + "--benchmark_json_dir", + help="The directory of benchmark JSON files, which to report the noise of", + required=True, + ) + parser.add_argument( + "--accept_high_noise", + help="Don't call exit(1) when there is a noisy benchmark permutation", + action=argparse.BooleanOptionalAction, + ) + args = parser.parse_args() + + print(f"The noise threshold is {args.noise_threshold_percentage:.1f}%\n") + + benchmarks, load_success = load_benchmarks(args.benchmark_json_dir) + results, results_success = get_results(benchmarks, args.noise_threshold_percentage) + + print_results(results) + + if not load_success: + return False + if args.accept_high_noise: + return True + return results_success + + +if __name__ == "__main__": + success = main() + if success: + exit(0) + else: + exit(1) diff --git a/projects/rocprim/.gitlab/run_benchmarks.py b/projects/rocprim/.gitlab/run_benchmarks.py index f406ea473d4..2c0ff81f1f0 100755 --- a/projects/rocprim/.gitlab/run_benchmarks.py +++ b/projects/rocprim/.gitlab/run_benchmarks.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -51,9 +51,6 @@ def is_benchmark_executable(filename): results_json_path = os.path.join(benchmark_context.benchmark_output_dir, results_json_name) args = [ benchmark_path, - '--name_format', - 'json', - '--benchmark_out_format=json', f'--benchmark_out={results_json_path}', f'--benchmark_filter={benchmark_context.benchmark_filter_regex}' ] diff --git a/projects/rocprim/CHANGELOG.md b/projects/rocprim/CHANGELOG.md index bffcceb3b8d..bf6adb2e6a2 100644 --- a/projects/rocprim/CHANGELOG.md +++ b/projects/rocprim/CHANGELOG.md @@ -53,6 +53,7 @@ This is a complete list of affected functions and how their default accumulator * `rocprim::deterministic_exclusive_scan` * Previous default: `class AccType = detail::input_type_t>` * Current default: `class AccType = rocprim::accumulator_t>` +* Undeprecated internal `detail::raw_storage`. ### Deprecations diff --git a/projects/rocprim/benchmark/benchmark_utils.hpp b/projects/rocprim/benchmark/benchmark_utils.hpp index 84076fe41e9..98b8341784f 100644 --- a/projects/rocprim/benchmark/benchmark_utils.hpp +++ b/projects/rocprim/benchmark/benchmark_utils.hpp @@ -1345,8 +1345,8 @@ class executor parser.set_optional("trials", "trials", default_trials, "number of iterations"); parser.set_optional("name_format", "name_format", - "human", - "either: json,human,txt"); + "json", + "either json, human, or txt"); // Optionally run an evenly split subset of benchmarks for autotuning. parser.set_optional("parallel_instance", diff --git a/projects/rocprim/common/utils_device_ptr.hpp b/projects/rocprim/common/utils_device_ptr.hpp index e0ed783f43a..b1be71b9b30 100644 --- a/projects/rocprim/common/utils_device_ptr.hpp +++ b/projects/rocprim/common/utils_device_ptr.hpp @@ -294,6 +294,7 @@ class device_ptr = common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size); if(err == hipErrorOutOfMemory) { + (void) hipGetLastError(); // reset internally recorded HIP error return false; } HIP_CHECK(err); diff --git a/projects/rocprim/docs/block_ops/data_mov_funcs.rst b/projects/rocprim/docs/block_ops/data_mov_funcs.rst index 4fce51dbd54..2aacd1c4216 100644 --- a/projects/rocprim/docs/block_ops/data_mov_funcs.rst +++ b/projects/rocprim/docs/block_ops/data_mov_funcs.rst @@ -67,3 +67,18 @@ Store .. doxygengroup:: blockmodule_warp_store_functions :content-only: + +Direct Blocked Cast +==================== + +Load +--------- + +.. doxygengroup:: blockmodule_cast_load_functions + :content-only: + +Store +---------- + +.. doxygengroup:: blockmodule_cast_store_functions + :content-only: diff --git a/projects/rocprim/docs/concepts/spirv.rst b/projects/rocprim/docs/concepts/spirv.rst new file mode 100644 index 00000000000..ff84b6d026c --- /dev/null +++ b/projects/rocprim/docs/concepts/spirv.rst @@ -0,0 +1,188 @@ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _spirv: + +****** +SPIR-V +****** + +.. warning:: + This is an **experimental compile target** and may be subject to change without notice. + Use at your own risk. Features may be incomplete, unstable, or incompatible with future + versions. It is **not recommended for production use**. + +rocPRIM has experimental support for AMD GPU target agnostic SPIR-V. + +Requirements +============ + +The define ``ROCPRIM_EXPERIMENTAL_SPIRV`` must be set to a non-zero value. For example: + +.. code:: shell + + hipcc -DROCPRIM_EXPERIMENTAL_SPIRV=1 --offload-arch=amdgcnspirv + +.. warning:: + Setting `ROCPRIM_EXPERIMENTAL_SPIRV` will disable all config dispatching. + +.. note:: + When using CMake, this flag needs to be set in the `CMAKE_CXX_FLAGS`. For example: + + .. code:: shell + + cmake -DCMAKE_CXX_FLAGS="-DROCPRIM_EXPERIMENTAL_SPIRV=1 --offload-arch=amdgcnspirv" + +No other target apart from ``amdgcnspirv`` should be targeted. rocPRIM does not support mixed +compilation targets. + +.. code:: shell + + # !!! This does not work !!! + hipcc -DROCPRIM_EXPERIMENTAL_SPIRV=1 --offload-arch=amdgcnspirv --offload-arch=gfx942 + +Warp- and block-level dispatch +============================== + +When targeting SPIR-V, the hardware wavefront size (also known as warp size) is not known +at compile time. It is only known during the execution of the host program and thus at the +runtime compilation of SPIR-V code to the ISA of the actual target, e.g. gfx942 (MI300). +rocPRIM will add implicit dispatching where it is needed when targeting SPIR-V. This allows +the same SPIR-V code to work with hardware wavefront sizes of both 32 and 64. + +Adding dispatching +------------------ + +Adding SPIR-V wavefront dependent dispatching to an existing warp or block level algorithm can +be done using infrastructure provided by rocPRIM (see: ``rocprim::arch::wavefront``). This does +require some extra boiler plate. For example, for an algorithm with some API: + +.. code:: cpp + + template + class my_alg + { + private: + static constexpr unsigned int wave_size = 32; + public: + __device__ void some_api(T& a, T& b) + { + some_opaque_impl(a, b); + } + }; + +We need to know the current hardware wavefront size. This can be done with +``rocprim::arch::wavefront::get_target()``. A partial specialization needs to be defined to +handle the dispatching for dynamic wavefront sizes (either 32 or 64). + +.. code:: cpp + + template< + typename T, + rocprim::arch::wavefront::target TargetWaveSize = rocprim::arch::wavefront::get_target() + > + class my_alg + { + private: + constexpr unsigned int wave_size = rocprim::arch::wavefront::size_from_target(); + public: + __device__ void some_api(T& a, T& b) + { + some_opaque_impl(a, b); + } + }; + + template + class my_alg + { + private: + using impl32 = my_alg; + using impl64 = my_alg; + public: + __device__ void some_api(T& a, T& b) + { + if (rocprim::arch::wavefront::size() == 32) { + impl32().some_api(a, b); + } else { + impl64().some_api(a, b); + } + } + }; + +.. note:: + + Developers of rocPRIM can use the the ``rocprim::detail::dispatch_wave_size``. This function will also manage + exposed ``storage_type``- types to handle and map shared memory. Variadic templates are used to capture all + signatures for a given member function. + + .. warning:: + ``rocprim::detail::dispatch_wave_size`` is used internally by rocPRIM. Usage by downstream users is not + recommended because its behaviour and signature may change at any moment. + + .. code:: cpp + + template + class my_alg + { + private: + using impl32 = my_alg; + using impl64 = my_alg; + using dispatch = rocprim::detail::dispatch_wave_size; + public: + template + __device__ void some_api(Args&&... args) + { + dispatch{}([](auto impl, auto&&... args) { impl.some_api(args...); }, args...); + } + }; + +Invoking a dispatchable algorithm +--------------------------------- + +When a rocPRIM provided algorithm is dependent on the wavefront size, the dispatcher will handle most of the +cases. ``rocprim::arch::wavefront::get_target()`` will resolve to ``target::dynamic`` and be handled via +partial specialization. However, there are cases where this is not desired. A downside of the dispatched +implementation is that more shared memory is allocated than actually used. An algorithm that runs on a device +with wavefront size 32 may have a different shared memory footprint than one with wavefront size 64. Using the +dispatcher will then result in suboptimal occupancy. + +If it's known that a kernel is only invoked on hardware with a specific wavefront size, then the wavefront size can +be passed to it. + +.. code:: cpp + + using key_type = int; + using value_type = rocprim::empty_type; + + // Will use dispatch internally when targeting SPIR-V. + using sort_dispatch = rocprim::warp_sort< + key_type, + value_type + >; + + // Will not use dispatch. + // Will only work proper with hardware with wavefront size of 32. + // Undefined behaviour on hardware with wavefront size of 64. + using sort_wave32 = rocprim::warp_sort< + key_type, + value_type, + rocprim::arch::wavefront::target::wave32 + >; + + // Will not use dispatch. + // Will only work proper with hardware with wavefront size of 64. + // Undefined behaviour on hardware with wavefront size of 32. + using sort_wave64 = rocprim::warp_sort< + key_type, value_type, + rocprim::arch::wavefront::target::wave64 + >; + +When compiling for a specific architecture (and thus not SPIR-V), dispatch will not be used by default since +``rocprim::arch::wavefront::get_target()`` will already resolve to the wavefront size of the architecture. + +.. code:: cpp + + #ifdef ROCPRIM_TARGET_CDNA3 + static_assert(rocprim::arch::wavefront::target::wave64 == rocprim::arch::wavefront::get_target()); + #endif diff --git a/projects/rocprim/docs/concepts/tuning.rst b/projects/rocprim/docs/concepts/tuning.rst index f2b16fb0347..6a5419dd902 100644 --- a/projects/rocprim/docs/concepts/tuning.rst +++ b/projects/rocprim/docs/concepts/tuning.rst @@ -16,7 +16,7 @@ What we call *autotuning* is a method of generating the above-mentioned architec 1. Configure the project for autotuning. Autotune is an extension on top of the regular benchmarking process and it is enabled with a CMake option ``BENCHMARK_CONFIG_TUNING``, which doubles as a C++ macro to determine whether autotuning is enabled. 2. When the project is configured, a large amount of C++ benchmark files are generated with variation in parameters such as block size, items per thread, and method. The files are generated based on a template (``benchmark/benchmark_*.parallel.cpp.in``) and arguments defined in ``ConfigAutotuneSettings.cmake``. CMake will automatically detect when the input template changes and will reconfigure the required files as necessary. 3. Compile results in one executable based on all generated files for an algorithm. -4. Run the executable and gather the JSON output files. The generation of output files is triggered by the use of ``--benchmark_out_format=json --name_format json`` and ``--benchmark_out=.json`` options when running the executable. +4. Run the executable and gather the JSON output files. The generation of output files is triggered by the use of ``--benchmark_out=.json`` when running the executable. 5. Convert the benchmark results into a config with ``scripts/autotune/create_optimization.py``. This python script injects the optimal configurations into the templates in ``scripts/autotune/templates``. * The option ``--out_basedir`` can be used to place the output config(s) in a specific path, otherwise the config(s) will be placed in the current directory. diff --git a/projects/rocprim/docs/doxygen/blockmodule.dox b/projects/rocprim/docs/doxygen/blockmodule.dox index 9b4b2aa7b6e..2d4a67ebe83 100644 --- a/projects/rocprim/docs/doxygen/blockmodule.dox +++ b/projects/rocprim/docs/doxygen/blockmodule.dox @@ -18,4 +18,14 @@ /** * \defgroup blockmodule_warp_store_functions Store Functions * \ingroup blockmodule - */ \ No newline at end of file + */ + +/** + * \defgroup blockmodule_cast_load_functions Load Functions + * \ingroup blockmodule + */ + +/** + * \defgroup blockmodule_cast_store_functions Store Functions + * \ingroup blockmodule + */ diff --git a/projects/rocprim/docs/index.rst b/projects/rocprim/docs/index.rst index 7d1015c2c3b..c3486d36019 100644 --- a/projects/rocprim/docs/index.rst +++ b/projects/rocprim/docs/index.rst @@ -22,6 +22,7 @@ The rocPRIM documentation is structured as follows: * :ref:`glossary` * :ref:`tuning` * :ref:`developer` + * :ref:`spirv` .. grid-item-card:: API reference diff --git a/projects/rocprim/docs/reference/intrinsics.rst b/projects/rocprim/docs/reference/intrinsics.rst index 2fab8cb2e56..e8defd31e04 100644 --- a/projects/rocprim/docs/reference/intrinsics.rst +++ b/projects/rocprim/docs/reference/intrinsics.rst @@ -16,7 +16,7 @@ Hardware Architecture .. doxygenfunction:: rocprim::arch::wavefront::max_size() .. doxygenenum:: rocprim::arch::wavefront::target -.. doxygenfunction:: rocprim::arch::wavefront::target() +.. doxygenfunction:: rocprim::arch::wavefront::get_target() .. doxygenfunction:: rocprim::arch::wavefront::size_from_target() Bitwise diff --git a/projects/rocprim/docs/sphinx/_toc.yml.in b/projects/rocprim/docs/sphinx/_toc.yml.in index 23ec331b41d..b2d64a2e6ca 100644 --- a/projects/rocprim/docs/sphinx/_toc.yml.in +++ b/projects/rocprim/docs/sphinx/_toc.yml.in @@ -10,6 +10,7 @@ subtrees: - file: concepts/glossary.rst - file: concepts/tuning.rst - file: concepts/developer.rst + - file: concepts/spirv.rst - file: reference/reference.rst title: API Reference subtrees: diff --git a/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp b/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp index 0fae5677651..bffd8c016a2 100644 --- a/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp +++ b/projects/rocprim/rocprim/include/rocprim/block/block_load_func.hpp @@ -44,7 +44,7 @@ BEGIN_ROCPRIM_NAMESPACE /// \p ItemsPerThread into \p items. /// /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -79,7 +79,7 @@ void block_load_direct_blocked(unsigned int flat_id, /// \p ItemsPerThread into \p items. /// /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -120,7 +120,7 @@ void block_load_direct_blocked(unsigned int flat_id, /// \p ItemsPerThread into \p items. /// /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -166,7 +166,7 @@ void block_load_direct_blocked(unsigned int flat_id, /// block_load_direct_blocked: /// * \p ItemsPerThread is odd. /// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2, -/// int4, etc. +/// int4, etc.) /// /// \tparam T [inferred] the input data type /// \tparam U [inferred] the output data type @@ -232,7 +232,7 @@ block_load_direct_blocked_vectorized(unsigned int flat_id, /// /// \tparam BlockSize the number of threads in a block /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -268,7 +268,7 @@ void block_load_direct_striped(unsigned int flat_id, /// /// \tparam BlockSize the number of threads in a block /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -311,7 +311,7 @@ void block_load_direct_striped(unsigned int flat_id, /// /// \tparam BlockSize the number of threads in a block /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -354,7 +354,6 @@ void block_load_direct_striped(unsigned int flat_id, /// \p ItemsPerThread into \p items. /// /// * The number of threads in the block must be a multiple of \p VirtualWaveSize. -/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value. /// * \p VirtualWaveSize must be a power of two and equal or less than the size of /// hardware warp. /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower @@ -362,7 +361,7 @@ void block_load_direct_striped(unsigned int flat_id, /// /// \tparam VirtualWaveSize [optional] the number of threads in a warp /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -370,10 +369,7 @@ void block_load_direct_striped(unsigned int flat_id, /// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread /// \param block_input the input iterator from the thread block to load from /// \param items array that data is loaded to -template +template ROCPRIM_DEVICE ROCPRIM_INLINE void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, @@ -397,6 +393,46 @@ void block_load_direct_warp_striped(unsigned int flat_id, } } +/// \brief Loads data from continuous memory into a warp-striped arrangement of items +/// across the thread block, using the hardware warp size. +/// +/// \ingroup blockmodule_warp_load_functions +/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items +/// across a thread block. Each thread uses a \p flat_id to load a range of +/// \p ItemsPerThread into \p items. +/// +/// \tparam InputIterator [inferred] an iterator type for input (can be a simple +/// pointer) +/// \tparam T [inferred] the data type +/// \tparam ItemsPerThread [inferred] the number of items to be processed by +/// each thread +/// +/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread +/// \param block_input the input iterator from the thread block to load from +/// \param items array that data is loaded to +template +ROCPRIM_DEVICE ROCPRIM_INLINE +void block_load_direct_warp_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread]) +{ + if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size()) + { + block_load_direct_warp_striped(flat_id, block_input, items); + } + else + { + if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64) + { + block_load_direct_warp_striped(flat_id, block_input, items); + } + else + { + block_load_direct_warp_striped(flat_id, block_input, items); + } + } +} + /// \brief Loads data from continuous memory into a warp-striped arrangement of items /// across the thread block, which is guarded by range \p valid. /// @@ -406,7 +442,6 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// \p ItemsPerThread into \p items. /// /// * The number of threads in the block must be a multiple of \p VirtualWaveSize. -/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value. /// * \p VirtualWaveSize must be a power of two and equal or less than the size of /// hardware warp. /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower @@ -414,7 +449,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// /// \tparam VirtualWaveSize [optional] the number of threads in a warp /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -423,10 +458,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// \param block_input the input iterator from the thread block to load from /// \param items array that data is loaded to /// \param valid maximum range of valid numbers to load -template +template ROCPRIM_DEVICE ROCPRIM_INLINE void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, @@ -455,6 +487,57 @@ void block_load_direct_warp_striped(unsigned int flat_id, } } +/// \brief Loads data from continuous memory into a warp-striped arrangement of items +/// across the thread block, which is guarded by range \p valid, using the hardware warp size. +/// +/// \ingroup blockmodule_warp_load_functions +/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items +/// across a thread block. Each thread uses a \p flat_id to load a range of +/// \p ItemsPerThread into \p items. +/// +/// \tparam InputIterator [inferred] an iterator type for input (can be a simple +/// pointer) +/// \tparam T [inferred] the data type +/// \tparam ItemsPerThread [inferred] the number of items to be processed by +/// each thread +/// +/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread +/// \param block_input the input iterator from the thread block to load from +/// \param items array that data is loaded to +/// \param valid maximum range of valid numbers to load +template +ROCPRIM_DEVICE ROCPRIM_INLINE +void block_load_direct_warp_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid) +{ + if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size()) + { + block_load_direct_warp_striped(flat_id, + block_input, + items, + valid); + } + else + { + if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64) + { + block_load_direct_warp_striped(flat_id, + block_input, + items, + valid); + } + else + { + block_load_direct_warp_striped(flat_id, + block_input, + items, + valid); + } + } +} + /// \brief Loads data from continuous memory into a warp-striped arrangement of items /// across the thread block, which is guarded by range with a fall-back value /// for out-of-bound elements. @@ -465,7 +548,6 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// \p ItemsPerThread into \p items. /// /// * The number of threads in the block must be a multiple of \p VirtualWaveSize. -/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value. /// * \p VirtualWaveSize must be a power of two and equal or less than the size of /// hardware warp. /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower @@ -473,7 +555,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// /// \tparam VirtualWaveSize [optional] the number of threads in a warp /// \tparam InputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -484,7 +566,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// \param items array that data is loaded to /// \param valid maximum range of valid numbers to load /// \param out_of_bounds default value assigned to out-of-bound items -template(flat_id, block_input, items, valid); } -template +ROCPRIM_DEVICE ROCPRIM_INLINE +void block_load_direct_warp_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) +{ + if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size()) + { + block_load_direct_warp_striped(flat_id, + block_input, + items, + valid, + out_of_bounds); + } + else + { + if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64) + { + block_load_direct_warp_striped(flat_id, + block_input, + items, + valid, + out_of_bounds); + } + else + { + block_load_direct_warp_striped(flat_id, + block_input, + items, + valid, + out_of_bounds); + } + } +} + +/// \brief Loads data from continuous memory into a blocked arrangement of items +/// across the thread block. +/// +/// \ingroup blockmodule_cast_load_functions +/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items +/// across a thread block. Each thread uses a \p flat_id to load a range of +/// \p ItemsPerThread into \p items. +/// +/// The following conditions will prevent casting and switch to default +/// block_store_direct_blocked: +/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V) +/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2, +/// int4, etc.) +/// +/// \tparam V [optional] the type it will be casted to +/// \tparam cache_load_modifier [optional] the type thread load used +/// \tparam VirtualWaveSize [optional] the number of threads in a warp +/// \tparam T [inferred] the output data type +/// \tparam U [inferred] the input data type +/// \tparam ItemsPerThread [inferred] the number of items to be processed by +/// each thread +/// +/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread +/// \param block_input the input iterator from the thread block to load from +/// \param items array that data is loaded to +template ROCPRIM_DEVICE ROCPRIM_INLINE auto block_load_direct_blocked_cast(unsigned int flat_id, T* block_input, - U (&items)[ItemsPerThread]) -> - typename std::enable_if::value - && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type + U (&items)[ItemsPerThread]) +#ifndef DOXYGEN_SHOULD_SKIP_THIS + -> typename std::enable_if::value + && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type +#endif // DOXYGEN_SHOULD_SKIP_THIS { static_assert(detail::is_power_of_two(VirtualWaveSize) && VirtualWaveSize <= arch::wavefront::max_size(), @@ -542,9 +709,10 @@ auto block_load_direct_blocked_cast(unsigned int flat_id, } } -template @@ -557,6 +725,64 @@ auto block_load_direct_blocked_cast(unsigned int flat_id, { block_load_direct_blocked(flat_id, block_input, items); } +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/// \brief Loads data from continuous memory into a blocked arrangement of items +/// across the thread block, using hardware warp size. +/// +/// \ingroup blockmodule_cast_load_functions +/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items +/// across a thread block. Each thread uses a \p flat_id to load a range of +/// \p ItemsPerThread into \p items. +/// +/// The following conditions will prevent casting and switch to default +/// block_store_direct_blocked: +/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V) +/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2, +/// int4, etc.) +/// +/// \tparam V [optional] the type it will be casted to +/// \tparam cache_load_modifier [optional] the type thread load used +/// \tparam T [inferred] the output data type +/// \tparam U [inferred] the input data type +/// \tparam ItemsPerThread [inferred] the number of items to be processed by +/// each thread +/// +/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread +/// \param block_input the input iterator from the thread block to load from +/// \param items array that data is loaded to +template +ROCPRIM_DEVICE ROCPRIM_INLINE +auto block_load_direct_blocked_cast(unsigned int flat_id, + T* block_input, + U (&items)[ItemsPerThread]) +{ + if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size()) + { + block_load_direct_blocked_cast(flat_id, + block_input, + items); + } + else + { + if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64) + { + block_load_direct_blocked_cast(flat_id, + block_input, + items); + } + else + { + block_load_direct_blocked_cast(flat_id, + block_input, + items); + } + } +} END_ROCPRIM_NAMESPACE diff --git a/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp b/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp index aad00ab5f4d..060447eb28e 100644 --- a/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp +++ b/projects/rocprim/rocprim/include/rocprim/block/block_store_func.hpp @@ -44,7 +44,7 @@ BEGIN_ROCPRIM_NAMESPACE /// \p ItemsPerThread \p items to the thread block. /// /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -83,7 +83,7 @@ void block_store_direct_blocked(unsigned int flat_id, /// \p ItemsPerThread \p items to the thread block. /// /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -132,7 +132,7 @@ void block_store_direct_blocked(unsigned int flat_id, /// block_load_direct_blocked: /// * \p ItemsPerThread is odd. /// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2, -/// int4, etc. +/// int4, etc.) /// /// \tparam T [inferred] the output data type /// \tparam U [inferred] the input data type @@ -197,7 +197,7 @@ block_store_direct_blocked_vectorized(unsigned int flat_id, /// /// \tparam BlockSize the number of threads in a block /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -237,7 +237,7 @@ void block_store_direct_striped(unsigned int flat_id, /// /// \tparam BlockSize the number of threads in a block /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -283,7 +283,6 @@ void block_store_direct_striped(unsigned int flat_id, /// \p ItemsPerThread \p items to the thread block. /// /// * The number of threads in the block must be a multiple of \p VirtualWaveSize. -/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value. /// * \p VirtualWaveSize must be a power of two and equal or less than the size of /// hardware warp. /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower @@ -291,7 +290,7 @@ void block_store_direct_striped(unsigned int flat_id, /// /// \tparam VirtualWaveSize [optional] the number of threads in a warp /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -299,10 +298,7 @@ void block_store_direct_striped(unsigned int flat_id, /// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread /// \param block_output the input iterator from the thread block to store to /// \param items array that data is stored to thread block -template +template ROCPRIM_DEVICE ROCPRIM_INLINE void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, @@ -328,6 +324,46 @@ void block_store_direct_warp_striped(unsigned int flat_id, } } +/// \brief Stores a warp-striped arrangement of items from across the thread block +/// into a blocked arrangement on continuous memory, using the hardware warp size. +/// +/// \ingroup blockmodule_warp_store_functions +/// The warp-striped arrangement is assumed to be (\p VirtualWaveSize * \p ItemsPerThread) items +/// across a thread block. Each thread uses a \p flat_id to store a range of +/// \p ItemsPerThread \p items to the thread block. +/// +/// \tparam OutputIterator [inferred] an iterator type for input (can be a simple +/// pointer) +/// \tparam T [inferred] the data type +/// \tparam ItemsPerThread [inferred] the number of items to be processed by +/// each thread +/// +/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread +/// \param block_output the input iterator from the thread block to store to +/// \param items array that data is stored to thread block +template +ROCPRIM_DEVICE ROCPRIM_INLINE +void block_store_direct_warp_striped(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread]) +{ + if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size()) + { + block_store_direct_warp_striped(flat_id, block_output, items); + } + else + { + if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64) + { + block_store_direct_warp_striped(flat_id, block_output, items); + } + else + { + block_store_direct_warp_striped(flat_id, block_output, items); + } + } +} + /// \brief Stores a warp-striped arrangement of items from across the thread block /// into a blocked arrangement on continuous memory, which is guarded by range \p valid. /// @@ -337,7 +373,6 @@ void block_store_direct_warp_striped(unsigned int flat_id, /// \p ItemsPerThread \p items to the thread block. /// /// * The number of threads in the block must be a multiple of \p VirtualWaveSize. -/// * The default \p VirtualWaveSize is a hardware warpsize and is an optimal value. /// * \p VirtualWaveSize must be a power of two and equal or less than the size of /// hardware warp. /// * Using \p VirtualWaveSize smaller than hardware warpsize could result in lower @@ -345,7 +380,7 @@ void block_store_direct_warp_striped(unsigned int flat_id, /// /// \tparam VirtualWaveSize [optional] the number of threads in a warp /// \tparam OutputIterator [inferred] an iterator type for input (can be a simple -/// pointer +/// pointer) /// \tparam T [inferred] the data type /// \tparam ItemsPerThread [inferred] the number of items to be processed by /// each thread @@ -354,10 +389,7 @@ void block_store_direct_warp_striped(unsigned int flat_id, /// \param block_output the input iterator from the thread block to store to /// \param items array that data is stored to thread block /// \param valid maximum range of valid numbers to store -template +template ROCPRIM_DEVICE ROCPRIM_INLINE void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, @@ -390,17 +422,95 @@ void block_store_direct_warp_striped(unsigned int flat_id, } } -template +ROCPRIM_DEVICE ROCPRIM_INLINE +void block_store_direct_warp_striped(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid) +{ + if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size()) + { + block_store_direct_warp_striped(flat_id, + block_output, + items, + valid); + } + else + { + if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64) + { + block_store_direct_warp_striped(flat_id, + block_output, + items, + valid); + } + else + { + block_store_direct_warp_striped(flat_id, + block_output, + items, + valid); + } + } +} + +/// \brief Stores a blocked arrangement of items from across the thread block +/// into a blocked arrangement on continuous memory. +/// +/// \ingroup blockmodule_cast_store_functions +/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items +/// across a thread block. Each thread uses a \p flat_id to store a range of +/// \p ItemsPerThread \p items to the thread block. +/// +/// The following conditions will prevent casting and switch to default +/// block_store_direct_blocked: +/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V) +/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2, +/// int4, etc.) +/// +/// \tparam V [optional] the type it will be casted to +/// \tparam VirtualWaveSize [optional] the number of threads in a warp +/// \tparam T [inferred] the output data type +/// \tparam U [inferred] the input data type +/// \tparam ItemsPerThread [inferred] the number of items to be processed by +/// each thread +/// +/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread +/// \param block_output the input iterator from the thread block to load from +/// \param items array that data is loaded to +template ROCPRIM_DEVICE ROCPRIM_INLINE auto block_store_direct_blocked_cast(unsigned int flat_id, T* block_output, - U (&items)[ItemsPerThread]) -> - typename std::enable_if::value - && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type + U (&items)[ItemsPerThread]) +#ifndef DOXYGEN_SHOULD_SKIP_THIS + -> typename std::enable_if::value + && (ItemsPerThread * sizeof(T)) % sizeof(V) == 0>::type +#endif // DOXYGEN_SHOULD_SKIP_THIS { static_assert(std::is_convertible::value, "The type U must be such that it can be implicitly converted to T."); @@ -416,8 +526,9 @@ auto block_store_direct_blocked_cast(unsigned int flat_id, } } -template @@ -430,6 +541,56 @@ auto block_store_direct_blocked_cast(unsigned int flat_id, { block_store_direct_blocked(flat_id, block_output, items); } +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/// \brief Stores a blocked arrangement of items from across the thread block +/// into a blocked arrangement on continuous memory, using the hardware warp size. +/// +/// \ingroup blockmodule_cast_store_functions +/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items +/// across a thread block. Each thread uses a \p flat_id to store a range of +/// \p ItemsPerThread \p items to the thread block. +/// +/// The following conditions will prevent casting and switch to default +/// block_load_direct_blocked: +/// * \p ItemsPerThread * sizeof(T) should be a multiple of sizeof(V) +/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2, +/// int4, etc.) +/// +/// \tparam V [optional] the type it will be casted to +/// \tparam T [inferred] the output data type +/// \tparam U [inferred] the input data type +/// \tparam ItemsPerThread [inferred] the number of items to be processed by +/// each thread +/// +/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread +/// \param block_output the input iterator from the thread block to load from +/// \param items array that data is loaded to +template +ROCPRIM_DEVICE ROCPRIM_INLINE +auto block_store_direct_blocked_cast(unsigned int flat_id, + T* block_output, + U (&items)[ItemsPerThread]) +{ + + if constexpr(arch::wavefront::min_size() == arch::wavefront::max_size()) + { + block_store_direct_blocked_cast(flat_id, + block_output, + items); + } + else + { + if(arch::wavefront::size() == ROCPRIM_WARP_SIZE_64) + { + block_store_direct_blocked_cast(flat_id, block_output, items); + } + else + { + block_store_direct_blocked_cast(flat_id, block_output, items); + } + } +} END_ROCPRIM_NAMESPACE diff --git a/projects/rocprim/rocprim/include/rocprim/config.hpp b/projects/rocprim/rocprim/include/rocprim/config.hpp index edb947e879e..f090b4e24ae 100644 --- a/projects/rocprim/rocprim/include/rocprim/config.hpp +++ b/projects/rocprim/rocprim/include/rocprim/config.hpp @@ -27,7 +27,10 @@ #include #include -#include "rocprim_version.hpp" +// This file is generated by CMake. It's safe to ignore this when parsed by clangd. +#ifndef _CLANGD + #include "rocprim_version.hpp" +#endif // Inline namespace (e.g. ROCPRIM_300400_NS where 300400 is the rocPRIM version) is used to // eliminate issues when shared libraries are built with different versions of rocPRIM so they may @@ -165,11 +168,11 @@ #define ROCPRIM_DETAIL_HAS_DPP_WF 1 #endif -#ifndef ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS +#if !defined(ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS) && !defined(ROCPRIM_TARGET_SPIRV) #define ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS 1 #endif -#ifndef ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS +#if !defined(ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS) && !defined(ROCPRIM_TARGET_SPIRV) #define ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS 1 #endif diff --git a/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp b/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp index 25cd3b31255..40a30f8417d 100644 --- a/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp +++ b/projects/rocprim/rocprim/include/rocprim/detail/merge_path.hpp @@ -131,7 +131,11 @@ void serial_merge(KeyType* keys_shared, key_b = keys_shared[idx_b]; } +#if defined(__SPIRV__) // TEMP FIX + ROCPRIM_NO_UNROLL +#else ROCPRIM_UNROLL +#endif for(unsigned int i = 0; i < ItemsPerThread; ++i) { // If we don't have any in b, we always take from a. Then, if we don't diff --git a/projects/rocprim/rocprim/include/rocprim/detail/various.hpp b/projects/rocprim/rocprim/include/rocprim/detail/various.hpp index 40c14a31c42..08754f128a0 100644 --- a/projects/rocprim/rocprim/include/rocprim/detail/various.hpp +++ b/projects/rocprim/rocprim/include/rocprim/detail/various.hpp @@ -190,10 +190,11 @@ struct match_fundamental_type >::type; }; -// A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions +// A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions. +// Due to the reinterpret cast, it may in some cases be technically UB, but the generated code is usually +// more performant than proper code. template -struct [[deprecated("To store non default-constructible types in local memory, use " - "rocprim::uninitialized_array instead")]] raw_storage +struct raw_storage { // Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T using device_word = typename detail::match_fundamental_type::type; diff --git a/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp b/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp index 2eb9c9d111e..294941f01fc 100644 --- a/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp +++ b/projects/rocprim/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp @@ -198,7 +198,7 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void vectorized_copy_bytes(const void using vector_type = uint4; constexpr uint32_t ints_in_vector_type = sizeof(uint4) / sizeof(uint32_t); - constexpr auto warp_size = ::rocprim::arch::wavefront::min_size(); + const auto warp_size = ::rocprim::arch::wavefront::size(); const auto rank = ::rocprim::detail::block_thread_id<0>() % warp_size; const uint8_t* src = reinterpret_cast(input_buffer) + offset; @@ -230,9 +230,6 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE static void vectorized_copy_bytes(const void in_ptr += warp_size; } - // This can be outside the while block since 'warp_size % ints_in_vector_type' always is '0' - static_assert(warp_size % ints_in_vector_type == 0, "Warp size is not a multiple of 4"); - in_ptr = aligned.in_begin + rank * sizeof(vector_type); const uint32_t in_offset = (reinterpret_cast(in_ptr) % ints_in_vector_type); vector_type* aligned_out_ptr = aligned.out_begin + rank; @@ -315,7 +312,7 @@ template +struct AliasType { }; + +template +struct AliasType +{ + using type = typename std::iterator_traits::value_type>::value_type; +}; + +template +struct AliasType +{ + using type = unsigned char; +}; + template struct batch_memcpy_impl { @@ -333,13 +349,8 @@ struct batch_memcpy_impl using output_buffer_type = typename std::iterator_traits::value_type; using buffer_size_type = typename std::iterator_traits::value_type; - using input_type = typename std::iterator_traits::value_type; - - using Alias = - typename std::conditional::value_type>::value_type>::type; + // This type is either unsigned char (if IsMemCpy is true) or the InputBufferItType's value type. + using Alias = typename AliasType::type; // Offset over buffers. using buffer_offset_type = uint32_t; @@ -620,7 +631,7 @@ struct batch_memcpy_impl { const uint32_t warp_id = rocprim::warp_id(); const uint32_t warps_per_block - = rocprim::flat_block_size() / ::rocprim::arch::wavefront::min_size(); + = rocprim::flat_block_size() / ::rocprim::arch::wavefront::size(); for(buffer_offset_type buffer_offset = warp_id; buffer_offset < num_wlev_buffers; buffer_offset += warps_per_block) diff --git a/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp b/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp index 2cca72df5ff..3475a82705e 100644 --- a/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp +++ b/projects/rocprim/rocprim/include/rocprim/device/detail/device_histogram.hpp @@ -226,7 +226,7 @@ struct is_sample_vectorizable template ROCPRIM_DEVICE ROCPRIM_INLINE - typename std::enable_if::value>::type +typename std::enable_if::value>::type load_samples(unsigned int flat_id, Sample* samples, sample_vector (&values)[ItemsPerThread]) @@ -489,6 +489,7 @@ ROCPRIM_DEVICE ROCPRIM_INLINE void load_samples(flat_id, samples, values, valid_count); } + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { for(unsigned int channel = 0; channel < ActiveChannels; channel++) diff --git a/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp b/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp index 639a6b55f88..3c04bf60118 100644 --- a/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp +++ b/projects/rocprim/rocprim/include/rocprim/device/detail/device_run_length_encode.hpp @@ -155,14 +155,16 @@ struct discontinuity_helper /// Custom warp_exchange class with extra check in scatter_to_striped for out-of-bound accesses. template + unsigned int ItemsPerThread, + unsigned int WarpSize = ::rocprim::arch::wavefront::min_size(), + ::rocprim::arch::wavefront::target TargetWaveSize + = ::rocprim::arch::wavefront::get_target()> class custom_warp_exchange { static_assert(::rocprim::detail::is_power_of_two(WarpSize), "Logical warp size must be a power of two."); ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT( - WarpSize <= ::rocprim::arch::wavefront::min_size(), + WarpSize <= ::rocprim::arch::wavefront::size_from_target(), "Logical warp size cannot be larger than physical warp size."); static constexpr unsigned int warp_items = WarpSize * ItemsPerThread; @@ -414,10 +416,13 @@ template + unsigned int BlockSize, + unsigned int ItemsPerThread, + block_load_method load_input_method, + block_scan_algorithm scan_algorithm, + ::rocprim::arch::wavefront::target TargetWaveSize + = ::rocprim::arch::wavefront::get_target(), + typename Enabled = void> class block_helper { private: @@ -432,8 +437,8 @@ class block_helper = run_length_encode::discontinuity_helper; // Warp size. - static constexpr unsigned int warp_size - = detail::get_min_warp_size(BlockSize, ::rocprim::arch::wavefront::min_size()); + static constexpr unsigned int warp_size = detail::get_min_warp_size( + BlockSize, ::rocprim::arch::wavefront::size_from_target()); // Number of warps in block. static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size; @@ -446,9 +451,9 @@ class block_helper // warp_exchange primitives that will be used to perform warp-level scatter_to_striped // on offsets and counts. using warp_exchange_offsets_type - = custom_warp_exchange; + = custom_warp_exchange; using warp_exchange_counts_type - = custom_warp_exchange; + = custom_warp_exchange; // Helper class for scattering offsets and counts. using warp_scatter_type = run_length_encode::scatter_helper +class block_helper +{ +private: + using block_helper_wave32 = block_helper; + using block_helper_wave64 = block_helper; + + using dispatch = detail::dispatch_wave_size; + +public: + using storage_type = typename dispatch::storage_type; + + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH + using storage_type_ = detail::raw_storage; + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP + + template + ROCPRIM_DEVICE ROCPRIM_INLINE + auto process_block(Args&&... args) + { + return dispatch{}([](auto impl, auto&&... args) { return impl.process_block(args...); }, + args...); + } +}; + template; - ROCPRIM_SHARED_MEMORY typename block_processor::storage_type_ storage; + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH ROCPRIM_SHARED_MEMORY + typename detail::raw_storage + storage; + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP const size_t block_id = flat_block_id(); @@ -834,7 +896,7 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto block_id, grid_size, size, - storage); + storage.get()); } else if(valid_in_last_block > 0) { @@ -845,7 +907,7 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto block_id, grid_size, size, - storage); + storage.get()); // First thread of last block sets the total number of non-trivial runs found and updates // the counts with the last run's length if necessary. if(threadIdx.x == 0) diff --git a/projects/rocprim/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/projects/rocprim/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index 639ac93f726..b27dcc21be5 100644 --- a/projects/rocprim/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/projects/rocprim/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -46,7 +46,7 @@ // Global coherence of prefixes_*_values is ensured by atomic_load/atomic_store that bypass // cache. #ifndef ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES - #if defined(__HIP_DEVICE_COMPILE__) && (defined(__gfx942__) || defined(__gfx950__)) + #if defined(__HIP_DEVICE_COMPILE__) && (defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__)) #define ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES 1 #else #define ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES 0 diff --git a/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp b/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp index c1d08a066df..1813596ffc0 100644 --- a/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp +++ b/projects/rocprim/rocprim/include/rocprim/device/device_merge_inplace.hpp @@ -885,20 +885,24 @@ inline hipError_t merge_inplace(void* temporary_storage, block_block_size, impl::block_merge_kernel, stream)); + + const int min_grid_size = rocprim::min( + block_merge_grid_size, + static_cast(rocprim::detail::ceiling_div(left_size + right_size, block_block_size))); + if(debug_synchronous) { std::cout << "block_merge_kernel\n" - << " grid_size : " << block_merge_grid_size << "\n" + << " grid_size : " << min_grid_size << "\n" << " block_size : " << block_block_size << std::endl; } // each of the sub merging problem can be solved within a block - impl::block_merge_kernel<<>>( - data, - left_size + right_size, - compare_function, - work_storage, - scratch_storage); + impl::block_merge_kernel<<>>(data, + left_size + right_size, + compare_function, + work_storage, + scratch_storage); ROCPRIM_RETURN_ON_ERROR(hipGetLastError()); if(debug_synchronous) { diff --git a/projects/rocprim/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp b/projects/rocprim/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp index ba61bae45db..e3e91b60fd1 100644 --- a/projects/rocprim/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp +++ b/projects/rocprim/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp @@ -218,7 +218,7 @@ class texture_cache_iterator #else texture_type words[multiple]; - #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) + #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__) || defined(__GFX12__) #pragma message "Texture cache iterator is not supported on gfx94x, gfx120x or gfx95x as the texture fetch functions in HIP are not available." ROCPRIM_PRINT_ERROR_ONCE("WARNING: Usage of texture_cache_iterator on gfx94x, gfx120x or gfx95x devices is not supported and will not produce valid results.") #else diff --git a/projects/rocprim/rocprim/include/rocprim/thread/thread_load.hpp b/projects/rocprim/rocprim/include/rocprim/thread/thread_load.hpp index 0c5cfd065ae..7207972caf0 100644 --- a/projects/rocprim/rocprim/include/rocprim/thread/thread_load.hpp +++ b/projects/rocprim/rocprim/include/rocprim/thread/thread_load.hpp @@ -112,9 +112,9 @@ T asm_thread_load(void* ptr) ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_inst, wait_cmd); // clang-format on - #if defined(__gfx942__) || defined(__gfx950__) + #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__) ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc0 nt", "s_waitcnt", ""); - #elif defined(__gfx1200__) || defined(__gfx1201__) + #elif defined(__GFX12__) ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", ""); #else ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "glc slc", "s_waitcnt", ""); diff --git a/projects/rocprim/rocprim/include/rocprim/thread/thread_store.hpp b/projects/rocprim/rocprim/include/rocprim/thread/thread_store.hpp index ec945b7e89c..9821ba4aadf 100644 --- a/projects/rocprim/rocprim/include/rocprim/thread/thread_store.hpp +++ b/projects/rocprim/rocprim/include/rocprim/thread/thread_store.hpp @@ -110,9 +110,9 @@ void asm_thread_store(void* ptr, T val) ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_inst, wait_cmd); // clang-format on - #if defined(__gfx942__) || defined(__gfx950__) + #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__) ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 nt", "s_waitcnt", ""); - #elif defined(__gfx1200__) || defined(__gfx1201__) + #elif defined(__GFX12__) ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_storecnt_dscnt", diff --git a/projects/rocprim/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp b/projects/rocprim/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp index b5610886294..c5eb21bd249 100644 --- a/projects/rocprim/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp +++ b/projects/rocprim/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp @@ -53,48 +53,48 @@ class warp_scan_dpp if(VirtualWaveSize > 1) { - T t = scan_op(warp_move_dpp(output), output); // row_shr:1 + T t = warp_move_dpp(output); // row_shr:1 if(row_lane_id >= 1) - output = t; + output = scan_op(t, output); } if(VirtualWaveSize > 2) { - T t = scan_op(warp_move_dpp(output), output); // row_shr:2 + T t = warp_move_dpp(output); // row_shr:2 if(row_lane_id >= 2) - output = t; + output = scan_op(t, output); } if(VirtualWaveSize > 4) { - T t = scan_op(warp_move_dpp(output), output); // row_shr:4 + T t = warp_move_dpp(output); // row_shr:4 if(row_lane_id >= 4) - output = t; + output = scan_op(t, output); } if(VirtualWaveSize > 8) { - T t = scan_op(warp_move_dpp(output), output); // row_shr:8 + T t = warp_move_dpp(output); // row_shr:8 if(row_lane_id >= 8) - output = t; + output = scan_op(t, output); } #ifdef ROCPRIM_DETAIL_HAS_DPP_BROADCAST if(VirtualWaveSize > 16) { - T t = scan_op(warp_move_dpp(output), output); // row_bcast:15 + T t = warp_move_dpp(output); // row_bcast:15 if(lane_id % 32 >= 16) - output = t; + output = scan_op(t, output); } if(VirtualWaveSize > 32) { - T t = scan_op(warp_move_dpp(output), output); // row_bcast:31 + T t = warp_move_dpp(output); // row_bcast:31 if(lane_id >= 32) - output = t; + output = scan_op(t, output); } static_assert(VirtualWaveSize <= 64, "VirtualWaveSize > 64 is not supported"); #else if(VirtualWaveSize > 16) { - T t = scan_op(warp_swizzle(output), output); // row_bcast:15 + T t = warp_swizzle(output); // row_bcast:15 if(lane_id % 32 >= 16) - output = t; + output = scan_op(t, output); } static_assert(VirtualWaveSize <= 32, "VirtualWaveSize > 32 is not supported without DPP broadcasts"); diff --git a/projects/rocprim/scripts/autotune-search/main.py b/projects/rocprim/scripts/autotune-search/main.py index f9e7fb25621..1d7e3f9a276 100755 --- a/projects/rocprim/scripts/autotune-search/main.py +++ b/projects/rocprim/scripts/autotune-search/main.py @@ -278,15 +278,12 @@ def sample(xs: List[float]) -> Union[float, int]: bench = subprocess.call( [ os.path.join(build_dir, 'benchmark', build_target), - '--name_format', - 'json', '--seed', 'random', # Random is better... I think? Otherwise we might overfit. '--size', f'{size}', '--trials', f'{trials}', - '--benchmark_out_format=json', f'--benchmark_out={result_filename}', ], cwd=result_dir, diff --git a/projects/rocprim/test/common_test_header.hpp b/projects/rocprim/test/common_test_header.hpp index fe845c4e256..554160ede6a 100755 --- a/projects/rocprim/test/common_test_header.hpp +++ b/projects/rocprim/test/common_test_header.hpp @@ -50,6 +50,7 @@ hipError_t error = condition; \ if(error == hipErrorOutOfMemory) \ { \ + (void) hipGetLastError(); \ std::cout << "Out of memory. Skipping size = " << size << std::endl; \ break; \ } \ diff --git a/projects/rocprim/test/rocprim/test_device_merge.cpp b/projects/rocprim/test/rocprim/test_device_merge.cpp index a51560bd802..fc07d9ca5bd 100644 --- a/projects/rocprim/test/rocprim/test_device_merge.cpp +++ b/projects/rocprim/test/rocprim/test_device_merge.cpp @@ -193,9 +193,21 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) test_utils::out_of_bounds_flag out_of_bounds; - common::device_ptr d_keys_input1(keys_input1); - common::device_ptr d_keys_input2(keys_input2); - common::device_ptr d_keys_output(keys_output.size()); + common::device_ptr d_keys_input1; + common::device_ptr d_keys_input2; + common::device_ptr d_keys_output; + + if(!d_keys_input1.resize_with_memory_check(keys_input1.size()) + || !d_keys_input2.resize_with_memory_check(keys_input2.size()) + || !d_keys_output.resize_with_memory_check(keys_output.size())) + { + std::cout << "Out of memory. Skipping test with sizes = {" << size1 << ", " << size2 + << "}" << std::endl; + break; + } + + d_keys_input1.store(keys_input1); + d_keys_input2.store(keys_input2); test_utils::bounds_checking_iterator d_keys_checking_output( d_keys_output.get(), @@ -219,7 +231,14 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test with sizes = {" << size1 << ", " << size2 + << "}" << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -354,12 +373,29 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) test_utils::out_of_bounds_flag out_of_bounds; - common::device_ptr d_keys_input1(keys_input1); - common::device_ptr d_keys_input2(keys_input2); - common::device_ptr d_keys_output(keys_output.size()); - common::device_ptr d_values_input1(values_input1); - common::device_ptr d_values_input2(values_input2); - common::device_ptr d_values_output(values_output.size()); + common::device_ptr d_keys_input1; + common::device_ptr d_keys_input2; + common::device_ptr d_keys_output; + common::device_ptr d_values_input1; + common::device_ptr d_values_input2; + common::device_ptr d_values_output; + + if(!d_keys_input1.resize_with_memory_check(keys_input1.size()) + || !d_keys_input2.resize_with_memory_check(keys_input2.size()) + || !d_keys_output.resize_with_memory_check(keys_output.size()) + || !d_values_input1.resize_with_memory_check(values_input1.size()) + || !d_values_input2.resize_with_memory_check(values_input1.size()) + || !d_values_output.resize_with_memory_check(values_output.size())) + { + std::cout << "Out of memory. Skipping test with sizes = {" << size1 << ", " << size2 + << "}" << std::endl; + break; + } + + d_keys_input1.store(keys_input1); + d_keys_input2.store(keys_input2); + d_values_input1.store(values_input1); + d_values_input2.store(values_input2); test_utils::bounds_checking_iterator d_keys_checking_output( d_keys_output.get(), @@ -390,7 +426,14 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test with sizes = {" << size1 << ", " << size2 + << "}" << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) diff --git a/projects/rocprim/test/rocprim/test_device_merge_sort.cpp b/projects/rocprim/test/rocprim/test_device_merge_sort.cpp index 4b315d4a3be..bf5792b954c 100644 --- a/projects/rocprim/test/rocprim/test_device_merge_sort.cpp +++ b/projects/rocprim/test/rocprim/test_device_merge_sort.cpp @@ -148,9 +148,17 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) 100, seed_value); // float16 can't exceed 65504 - common::device_ptr d_input(input); + common::device_ptr d_input; common::device_ptr d_output_alloc; - d_output_alloc.resize(in_place ? 0 : size); + + if(!d_input.resize_with_memory_check(size) + || !d_output_alloc.resize_with_memory_check(in_place ? 0 : size)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); common::device_ptr& d_output = in_place ? d_input : d_output_alloc; // compare function @@ -175,7 +183,13 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -258,15 +272,25 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) std::vector values_input(size); test_utils::iota(values_input.begin(), values_input.end(), 0); - common::device_ptr d_keys_input(keys_input); - common::device_ptr d_keys_output_alloc; - d_keys_output_alloc.resize(in_place ? 0 : size); + common::device_ptr d_keys_input; + common::device_ptr d_keys_output_alloc; + common::device_ptr d_values_input; + common::device_ptr d_values_output_alloc; + + if(!d_keys_input.resize_with_memory_check(size) + || !d_keys_output_alloc.resize_with_memory_check(in_place ? 0 : size) + || !d_values_input.resize_with_memory_check(size) + || !d_values_output_alloc.resize_with_memory_check(in_place ? 0 : size)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_keys_input.store(keys_input); + d_values_input.store(values_input); + common::device_ptr& d_keys_output = in_place ? d_keys_input : d_keys_output_alloc; - - common::device_ptr d_values_input(values_input); - common::device_ptr d_values_output_alloc; - d_values_output_alloc.resize(in_place ? 0 : size); common::device_ptr& d_values_output = in_place ? d_values_input : d_values_output_alloc; @@ -302,7 +326,13 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -390,6 +420,7 @@ void testLargeIndices() hipError_t malloc_status = common::hipMallocHelper(&d_output, size * sizeof(*d_output)); if(malloc_status == hipErrorOutOfMemory) { + (void) hipGetLastError(); // reset internally recorded HIP error std::cout << "Out of memory. Skipping size = " << size << std::endl; break; } @@ -418,6 +449,7 @@ void testLargeIndices() malloc_status = common::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes); if(malloc_status == hipErrorOutOfMemory) { + (void) hipGetLastError(); // reset internally recorded HIP error std::cout << "Out of memory. Skipping size = " << size << std::endl; HIP_CHECK(hipFree(d_output)); break; diff --git a/projects/rocprim/test/rocprim/test_device_partition.cpp b/projects/rocprim/test/rocprim/test_device_partition.cpp index 554f80c46f3..fff00f960fa 100644 --- a/projects/rocprim/test/rocprim/test_device_partition.cpp +++ b/projects/rocprim/test/rocprim/test_device_partition.cpp @@ -142,10 +142,21 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) std::vector input = test_utils::get_random_data_wrapped(size, 1, 100, seed_value); std::vector flags = test_utils::get_random_data01(size, 0.25, seed_value); - common::device_ptr d_input(input); - common::device_ptr d_flags(flags); - common::device_ptr d_output(input.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_input; + common::device_ptr d_flags; + common::device_ptr d_output; + common::device_ptr d_selected_count_output; + + if(!d_input.resize_with_memory_check(size) || !d_flags.resize_with_memory_check(size) + || !d_output.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); + d_flags.store(flags); // Calculate expected_selected and expected_rejected results on host std::vector expected_selected; @@ -184,7 +195,13 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -366,9 +383,18 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) // Generate data std::vector input = test_utils::get_random_data_wrapped(size, 1, 100, seed_value); - common::device_ptr d_input(input); - common::device_ptr d_output(input.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_input; + common::device_ptr d_output; + common::device_ptr d_selected_count_output; + + if(!d_input.resize_with_memory_check(size) || !d_output.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); // Calculate expected_selected and expected_rejected results on host std::vector expected_selected; @@ -406,7 +432,13 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -496,10 +528,20 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateTwoWay) // Generate data std::vector input = test_utils::get_random_data_wrapped(size, 1, 100, seed_value); - common::device_ptr d_input(input); - common::device_ptr d_selected(input.size()); - common::device_ptr d_rejected(input.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_input; + common::device_ptr d_selected; + common::device_ptr d_rejected; + common::device_ptr d_selected_count_output; + + if(!d_input.resize_with_memory_check(size) || !d_selected.resize_with_memory_check(size) + || !d_rejected.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); // Calculate expected_selected and expected_rejected results on host std::vector expected_selected; @@ -537,7 +579,13 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateTwoWay) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -645,11 +693,23 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateThreeWay) // Generate data const auto input = test_utils::get_random_data_wrapped(size, 1, 100, seed_value); - common::device_ptr d_input(input); - common::device_ptr d_first_output(input.size()); - common::device_ptr d_second_output(input.size()); - common::device_ptr d_unselected_output(input.size()); - common::device_ptr d_selected_counts(2); + common::device_ptr d_input; + common::device_ptr d_first_output; + common::device_ptr d_second_output; + common::device_ptr d_unselected_output; + common::device_ptr d_selected_counts; + + if(!d_input.resize_with_memory_check(size) + || !d_first_output.resize_with_memory_check(size) + || !d_second_output.resize_with_memory_check(size) + || !d_unselected_output.resize_with_memory_check(size) + || !d_selected_counts.resize_with_memory_check(2)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); const auto first_op = LessOp{std::get<0>(limits)}; const auto second_op = LessOp{std::get<1>(limits)}; @@ -696,7 +756,13 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateThreeWay) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) diff --git a/projects/rocprim/test/rocprim/test_device_select.cpp b/projects/rocprim/test/rocprim/test_device_select.cpp index e3fef19ca20..a8737bd8058 100644 --- a/projects/rocprim/test/rocprim/test_device_select.cpp +++ b/projects/rocprim/test/rocprim/test_device_select.cpp @@ -125,10 +125,21 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) std::vector input = test_utils::get_random_data_wrapped(size, 1, 100, seed_value); std::vector flags = test_utils::get_random_data_wrapped(size, 0, 1, seed_value); - common::device_ptr d_input(input); - common::device_ptr d_flags(flags); - common::device_ptr d_output(input.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_input; + common::device_ptr d_flags; + common::device_ptr d_output; + common::device_ptr d_selected_count_output; + + if(!d_input.resize_with_memory_check(size) || !d_flags.resize_with_memory_check(size) + || !d_output.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); + d_flags.store(flags); // Calculate expected results on host std::vector expected; @@ -161,7 +172,13 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -251,9 +268,18 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) // Generate data std::vector input = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); - common::device_ptr d_input(input); - common::device_ptr d_output(input.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_input; + common::device_ptr d_output; + common::device_ptr d_selected_count_output; + + if(!d_input.resize_with_memory_check(size) || !d_output.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); // Calculate expected results on host std::vector expected; @@ -286,7 +312,13 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -366,10 +398,21 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectFlagged) std::vector input = test_utils::get_random_data_wrapped(size, 1, 100, seed_value); std::vector flags = test_utils::get_random_data_wrapped(size, 0, 1, seed_value); - common::device_ptr d_input(input); - common::device_ptr d_flags(flags); - common::device_ptr d_output(input.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_input; + common::device_ptr d_flags; + common::device_ptr d_output; + common::device_ptr d_selected_count_output; + + if(!d_input.resize_with_memory_check(size) || !d_flags.resize_with_memory_check(size) + || !d_output.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); + d_flags.store(flags); // Calculate expected results on host std::vector expected; @@ -403,7 +446,13 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectFlagged) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -502,9 +551,19 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) } // Allocate and copy to device - common::device_ptr d_input(input); - common::device_ptr d_output(input.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_input; + common::device_ptr d_output; + common::device_ptr d_selected_count_output; + + if(!d_input.resize_with_memory_check(size) + || !d_output.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_input.store(input); // Calculate expected results on host std::vector expected; @@ -541,7 +600,13 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -864,11 +929,24 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKey) seed_value); // Allocate and copy to device - common::device_ptr d_keys_input(input_keys); - common::device_ptr d_values_input(input_values); - common::device_ptr d_keys_output(input_keys.size()); - common::device_ptr d_values_output(input_values.size()); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_keys_input; + common::device_ptr d_values_input; + common::device_ptr d_keys_output; + common::device_ptr d_values_output; + common::device_ptr d_selected_count_output; + + if(!d_keys_input.resize_with_memory_check(size) + || !d_values_input.resize_with_memory_check(size) + || !d_keys_output.resize_with_memory_check(size) + || !d_values_output.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_keys_input.store(input_keys); + d_values_input.store(input_values); // Calculate expected results on host std::vector expected_keys; @@ -913,7 +991,13 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKey) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) @@ -1027,9 +1111,20 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKeyAlias) seed_value); // Allocate and copy to device - common::device_ptr d_keys_input(input_keys); - common::device_ptr d_values_input(input_values); - common::device_ptr d_selected_count_output(1); + common::device_ptr d_keys_input; + common::device_ptr d_values_input; + common::device_ptr d_selected_count_output; + + if(!d_keys_input.resize_with_memory_check(size) + || !d_values_input.resize_with_memory_check(size) + || !d_selected_count_output.resize_with_memory_check(1)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } + + d_keys_input.store(input_keys); + d_values_input.store(input_values); // Calculate expected results on host std::vector expected_keys; @@ -1074,7 +1169,13 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKeyAlias) ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - common::device_ptr d_temp_storage(temp_storage_size_bytes); + common::device_ptr d_temp_storage; + + if(!d_temp_storage.resize_with_memory_check(temp_storage_size_bytes)) + { + std::cout << "Out of memory. Skipping test for size = " << size << std::endl; + break; + } test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) diff --git a/projects/rocprim/test/rocprim/test_intrinsics.cpp b/projects/rocprim/test/rocprim/test_intrinsics.cpp index 9b4fddede63..ac54a3998b8 100644 --- a/projects/rocprim/test/rocprim/test_intrinsics.cpp +++ b/projects/rocprim/test/rocprim/test_intrinsics.cpp @@ -544,7 +544,7 @@ void masked_bit_count_kernel(unsigned int* out, const max_lane_mask_type active_lanes) { const unsigned int out_index = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int in_index = out_index / rocprim::arch::wavefront::min_size(); + const unsigned int in_index = out_index / rocprim::arch::wavefront::size(); const auto value = static_cast(in[in_index]); unsigned int result = test_type_helper::uninitialized(); @@ -655,7 +655,7 @@ void warp_any_all_kernel(unsigned int* out, { const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int predicate - = (in[index / rocprim::arch::wavefront::min_size()] >> rocprim::lane_id()) & 1; + = (in[index / rocprim::arch::wavefront::size()] >> rocprim::lane_id()) & 1; unsigned int result = test_type_helper::uninitialized(); if(is_lane_active(active_lanes, rocprim::lane_id())) @@ -1105,7 +1105,7 @@ void group_elect_kernel(max_lane_mask_type* output, const unsigned int input_index = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int output_index - = blockIdx.x * warps_per_block + threadIdx.x / ::rocprim::arch::wavefront::min_size(); + = blockIdx.x * warps_per_block + threadIdx.x / ::rocprim::arch::wavefront::size(); if(rocprim::group_elect(input[input_index])) { diff --git a/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp b/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp index eafd7327c12..72e8bb8c7e1 100644 --- a/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp +++ b/projects/rocprim/test/rocprim/test_utils_custom_test_types.hpp @@ -78,7 +78,6 @@ struct custom_non_default_type }; // Custom type used in tests -// Loops are prevented from being unrolled due to a compiler bug in ROCm 5.2 for device code template struct custom_test_array_type { @@ -90,17 +89,14 @@ struct custom_test_array_type ROCPRIM_HOST_DEVICE inline custom_test_array_type() { -#pragma unroll 1 for(size_t i = 0; i < N; i++) { values[i] = T(i + 1); } } - ROCPRIM_HOST_DEVICE inline - custom_test_array_type(T v) + ROCPRIM_HOST_DEVICE inline custom_test_array_type(T v) { -#pragma unroll 1 for(size_t i = 0; i < N; i++) { values[i] = v; @@ -111,20 +107,18 @@ struct custom_test_array_type ROCPRIM_HOST_DEVICE inline custom_test_array_type(const custom_test_array_type& other) { -#pragma unroll 1 for(size_t i = 0; i < N; i++) { values[i] = other.values[i]; } } - ROCPRIM_HOST_DEVICE inline - ~custom_test_array_type() {} + ROCPRIM_HOST_DEVICE inline ~custom_test_array_type() {} - ROCPRIM_HOST_DEVICE inline - custom_test_array_type& operator=(const custom_test_array_type& other) + ROCPRIM_HOST_DEVICE + inline custom_test_array_type& + operator=(const custom_test_array_type& other) { -#pragma unroll 1 for(size_t i = 0; i < N; i++) { values[i] = other.values[i]; @@ -132,11 +126,11 @@ struct custom_test_array_type return *this; } - ROCPRIM_HOST_DEVICE inline - custom_test_array_type operator+(const custom_test_array_type& other) const + ROCPRIM_HOST_DEVICE + inline custom_test_array_type + operator+(const custom_test_array_type& other) const { custom_test_array_type result; -#pragma unroll 1 for(size_t i = 0; i < N; i++) { result.values[i] = values[i] + other.values[i]; @@ -148,7 +142,6 @@ struct custom_test_array_type custom_test_array_type operator-(const custom_test_array_type& other) const { custom_test_array_type result; -#pragma unroll 1 for(size_t i = 0; i < N; i++) { result.values[i] = values[i] - other.values[i]; @@ -159,7 +152,6 @@ struct custom_test_array_type ROCPRIM_HOST_DEVICE inline bool operator<(const custom_test_array_type& other) const { -#pragma unroll 1 for(unsigned int i = 0; i < N; i++) { if(values[i] < other.values[i]) @@ -174,10 +166,10 @@ struct custom_test_array_type return false; } - ROCPRIM_HOST_DEVICE inline - bool operator>(const custom_test_array_type& other) const + ROCPRIM_HOST_DEVICE + inline bool + operator>(const custom_test_array_type& other) const { -#pragma unroll 1 for(unsigned int i = 0; i < N; i++) { if(values[i] > other.values[i]) @@ -192,10 +184,10 @@ struct custom_test_array_type return false; } - ROCPRIM_HOST_DEVICE inline - bool operator==(const custom_test_array_type& other) const + ROCPRIM_HOST_DEVICE + inline bool + operator==(const custom_test_array_type& other) const { -#pragma unroll 1 for(size_t i = 0; i < N; i++) { if(values[i] != other.values[i]) @@ -206,8 +198,9 @@ struct custom_test_array_type return true; } - ROCPRIM_HOST_DEVICE inline - bool operator!=(const custom_test_array_type& other) const + ROCPRIM_HOST_DEVICE + inline bool + operator!=(const custom_test_array_type& other) const { return !(*this == other); }