Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
84 commits
Select commit Hold shift + click to select a range
1e13119
Abstract benchmarking loop
MyNameIsTrez Feb 7, 2025
2c0800c
Apply new benchmark abstraction - Part 2
MyNameIsTrez Feb 20, 2025
7348a1d
Apply new benchmark abstraction - Part 1
MyNameIsTrez Feb 25, 2025
43da4c0
Resolve "Introduce device_ptr to Benchmarks Part 3"
yungshengtu Feb 26, 2025
f6f2195
Resolve "Introduce device_ptr to Benchmarks Part 5"
yungshengtu Mar 5, 2025
29a6d12
Apply new benchmark abstraction - Part 3
MyNameIsTrez Mar 12, 2025
8bffaad
Apply new benchmark abstraction - Part 4
MyNameIsTrez Mar 13, 2025
a472e80
Resolve "Remove [[deprecated]]float_bit_mask and all uses of it from …
cenxuantian Jan 30, 2025
38ac5a5
Resolve "Remove short_radix_bits in segmented_radix_sort_config_params"
NB4444 Jan 31, 2025
4d265b7
Remove deprecations
NB4444 Jan 31, 2025
0249ce1
Resolve "Fix "warning: explicit specialization cannot have a storage …
NB4444 Jan 31, 2025
7c664b3
Resolve "Move rocprim::detail::radix_key_codec_base into traits system"
cenxuantian Feb 21, 2025
e642d8e
Apply new benchmark abstraction - Part 8
MyNameIsTrez Mar 14, 2025
d6ba6da
Apply new benchmark abstraction - Part 6
MyNameIsTrez Mar 17, 2025
06ce1d9
Fix merge issues and clang format
NB4444 Mar 18, 2025
79b4655
Resolve "Implement tuning for rocprim::search_n"
cenxuantian Mar 20, 2025
95ad769
Resolve "Apply new benchmark abstraction - Part 7"
ApoorvaKalyani Mar 21, 2025
eed8369
Resolve "Introduce device_ptr to BenchmarksPart 8"
yungshengtu Mar 21, 2025
0c5e3dd
Resolve "Introduce device_ptr to Benchmarks Part 2"
yungshengtu Mar 24, 2025
ca3b0cd
Resolve "Add virtual shared memory fallback to device_merge"
NB4444 Mar 25, 2025
cff2e16
Resolve "Add device-level inclusive_scan with initial value support"
Mar 27, 2025
a755431
Resolve "Make use of vectorized load in rocprim::transform"
jblok27 Mar 28, 2025
300de6e
Fix autotuning in benchmark_device_transform
NB4444 Mar 31, 2025
ffd4887
Apply new benchmark abstraction - Part 9
MyNameIsTrez Mar 31, 2025
dacfb1e
Resolve "Introduce device_ptr to Benchmarks Part 4"
yungshengtu Mar 31, 2025
a88ed02
Resolve "Introduce device_ptr to Benchmarks Part 6"
yungshengtu Apr 1, 2025
2148abe
Resolve "Autotune failure on benchmark_device_segmented_radix_sort_pa…
NB4444 Apr 2, 2025
68354b7
Derive 'ROCPRIM_WAVEFRONT_SIZE' from architecture defines
MyNameIsTrez Apr 2, 2025
5f7accb
Resolve "Change default scan accumulator type to be in line with (hip…
ApoorvaKalyani Apr 2, 2025
b135286
Apply new benchmark abstraction - Part 10
MyNameIsTrez Apr 2, 2025
ff1b0c5
feat(arch.hpp): implement mechanism for wavefront size-based dispatching
Naraenda Apr 2, 2025
c203cff
Deduplicate benchmark_device_binary_search
MyNameIsTrez Apr 3, 2025
684fd34
Put the Google Benchmark state in benchmark_utils::state
MyNameIsTrez Apr 4, 2025
fd2168f
Abstract device_histogram's benchmarking
MyNameIsTrez Apr 4, 2025
bfd2e5f
Resolve "Add SPIR-V to rocPRIM CI"
borysborys Apr 7, 2025
8825597
Resolve "Introduce device_ptr to Benchmarks Part 7"
yungshengtu Apr 7, 2025
19947dd
Replace benchmark template with regular parameter
MyNameIsTrez Apr 7, 2025
d91b967
CI Fix spirv build benchmark and tests
NB4444 Apr 7, 2025
6c747d3
Resolve "Fix compilation failure in hipCUB/rocThrust to rocPRIM."
yungshengtu Apr 8, 2025
53a1bea
fix: fix various compile issues when targeting spir-v
Naraenda Apr 8, 2025
9d08154
Resolve "Check device_transform for cuda parity"
NB4444 Apr 11, 2025
5f42dc3
Rename "tmp" to "unused" for clarity
MyNameIsTrez Apr 11, 2025
97e8f77
Resolve "Apply new benchmark abstraction to device_search_n"
ApoorvaKalyani Apr 11, 2025
87b473f
Resolve "rocm 6.4 failures in rocprim"
NB4444 Apr 14, 2025
2a54a85
Remove REGISTER_BENCHMARK(), config_autotune_interface, and config_au…
MyNameIsTrez Apr 7, 2025
b629596
Resolve "Replace all ROCPRIM_IF_CONSTEXPR with constexpr"
ApoorvaKalyani Apr 14, 2025
fdac589
Output JSON benchmark statistics
MyNameIsTrez Apr 15, 2025
e16e3da
Fix benchmarking assert
MyNameIsTrez Apr 15, 2025
8c50b54
ci(.gitlab-ci.yml): add tests for spirv target
Naraenda Apr 15, 2025
1ed975a
Resolve "SPIR-V: warp sort"
NB4444 Apr 15, 2025
58e7c0b
Initialize total_gbench_iterations and total_size to 0
MyNameIsTrez Apr 15, 2025
a69a9bb
Fix compile warning in thread_load for the new compiler
NB4444 Apr 15, 2025
9fe68bc
Stop repeating tests three times
MyNameIsTrez Apr 16, 2025
960461d
Disable dispatching with macro for usage with spir-v
NB4444 Apr 16, 2025
6ad6c2d
Call non-static method properly
MyNameIsTrez Apr 16, 2025
dd360e9
Fix unintended benchmark JSON format changes
MyNameIsTrez Apr 16, 2025
c50cd7a
Extra warp_sort check in tests
NB4444 Apr 17, 2025
0b58176
Fix benchmarks that call set_throughput() more than once
MyNameIsTrez Apr 17, 2025
d1beef2
Lower benchmark_device_batch_memcpy from 1 KiB to 0 Bytes
MyNameIsTrez Apr 17, 2025
4239ef5
Resolve "Match CUB's behavior in rocPRIM for device merge"
sikba Apr 22, 2025
7caf280
Resolve "device_merge_sort custom_huge_type failing test"
NB4444 Apr 23, 2025
527c24c
fix(intrinsics/atomics.hpp): fix atomics when compiler to spirv
Naraenda Apr 9, 2025
668f913
Resolve "Create tests for rocPRIM's bit_cast"
Saiyang-Zhang Apr 24, 2025
5815656
fix: improve compatibility with spir-v target in algorithms using 'la…
Naraenda Apr 24, 2025
ab9dc0a
Resolve "SPIR-V: warp reduce/scan"
Saiyang-Zhang Apr 24, 2025
9f0dcf1
Resolve "SPIR-V: block scan/reduce/RLD"
Saiyang-Zhang Apr 25, 2025
75820ee
Resolve "Temporarily stop running device_partition test for SPIR-V du…
yungshengtu May 1, 2025
54802ef
Resolve "fallback to host side input generation for device_run_length…
borysborys May 1, 2025
d3a8911
Resolve "SPIR-V: warp exchange/load/store"
yungshengtu May 2, 2025
cff88f8
Resolve "SPIR-V: block exchange/load/store (and funcs)"
yungshengtu May 2, 2025
b2bb04c
Resolve "device_run_length_encode failing test"
borysborys May 5, 2025
9fc6fff
test: fix clangd language server errors for tests that use generated …
Naraenda Apr 23, 2025
cc1c028
Resolve "SPIR-V: block radix rank/sort"
Saiyang-Zhang May 8, 2025
5daf2de
Resolve "REVERT: fallback to host side input generation for device_ru…
borysborys May 9, 2025
4784b69
Added generic pragmas and created fallback for atomics
NB4444 May 9, 2025
2db9f4e
ci(.gitlab-ci.yml): add timeout to spirv tests
Naraenda May 9, 2025
1ed863a
Resolve "SPIR-V: lookback_scan_state"
NB4444 May 12, 2025
7da3c6a
Resolve "Prepare to move 'lookback_scan' to public API"
parbenc May 12, 2025
9d6dd68
Clang format
NB4444 May 12, 2025
ab1ef9e
fix: skip including the init value in block aggregate for warp and bl…
Naraenda May 12, 2025
578f5a1
CHANGELOG update
NB4444 May 14, 2025
cff99e3
Fix failing test device_scan
NB4444 May 14, 2025
d210bc2
update rocprim version to 4.0.0
NB4444 May 20, 2025
8ed4a9a
Fix build error in benchmark_utils
NB4444 May 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 94 additions & 12 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ workflow:
when: never
- if: $CI_MERGE_REQUEST_TITLE !~ /Draft:/
variables:
ROCPRIM_TEST_RUNS: 3
ROCPRIM_TEST_RUNS: 1
- if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/
variables:
ROCPRIM_TEST_RUNS: 1
Expand Down Expand Up @@ -124,7 +124,7 @@ copyright-date:
-D AMDGPU_TARGETS=$GPU_TARGETS
-D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
-D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
-D CMAKE_CXX_STANDARD=14
-D CMAKE_CXX_STANDARD=17
-S $CI_PROJECT_DIR
-B $BUILD_DIR
- cmake
Expand Down Expand Up @@ -174,6 +174,7 @@ build:cmake-minimum-apt:
- .rules:build
variables:
EXTRA_CMAKE_CXX_FLAGS: ""
BUILD_TOOL_ARGS: ""
script:
- mkdir -p $BUILD_DIR
- cd $BUILD_DIR
Expand All @@ -196,7 +197,7 @@ build:cmake-minimum-apt:
-D CMAKE_CXX_STANDARD="$BUILD_VERSION"
-S $CI_PROJECT_DIR
-B $BUILD_DIR
- cmake --build $BUILD_DIR
- cmake --build $BUILD_DIR -- ${BUILD_TOOL_ARGS}
artifacts:
paths:
- $BUILD_DIR/.ninja_log
Expand All @@ -213,6 +214,31 @@ build:cmake-minimum-apt:
- $BUILD_DIR/test/test_*
expire_in: 1 day

build:spirv:
stage: build
needs: []
extends:
- .cmake-minimum
- .build:common
variables:
# For unknown reasons spir-v builds ignore 'clang diagnostic' pragmas that
# we use to ignore internal deprecations.
EXTRA_CMAKE_CXX_FLAGS: "-Wno-deprecated-declarations -mf16c -DROCPRIM_EXPERIMENTAL_SPIRV"
# Since not all targets are expected to build, do not stop building other
# targets when any target fails.
BUILD_TOOL_ARGS: "-k 0"
GPU_TARGETS: "amdgcnspirv"
image: "registry.streamhpc.internal/unstable-rocm:main"
allow_failure: true
parallel:
# Debug builds disabled due to excessive build times for debug test builds
matrix:
- BUILD_TYPE: Release
BUILD_TARGET: [BENCHMARK, TEST]
BUILD_VERSION: 17
artifacts:
when: always

build:cmake-latest:
stage: build
needs: []
Expand All @@ -224,7 +250,7 @@ build:cmake-latest:
matrix:
- BUILD_TYPE: Release
BUILD_TARGET: [BENCHMARK, TEST]
BUILD_VERSION: [14, 17]
BUILD_VERSION: 17

build:cmake-minimum:
needs: []
Expand All @@ -235,7 +261,7 @@ build:cmake-minimum:
matrix:
- BUILD_TYPE: [Debug, Release]
BUILD_TARGET: [BENCHMARK, TEST]
BUILD_VERSION: 14
BUILD_VERSION: 17

build:package:
stage: build
Expand All @@ -252,7 +278,7 @@ build:package:
-G Ninja
-D CMAKE_CXX_COMPILER="$AMDCLANG"
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_CXX_STANDARD=14
-D CMAKE_CXX_STANDARD=17
-B $PACKAGE_DIR
-S $CI_PROJECT_DIR
- cd $PACKAGE_DIR
Expand Down Expand Up @@ -285,7 +311,7 @@ build:windows:
-D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}"
-D CMAKE_BUILD_TYPE="$BUILD_TYPE"
-D CMAKE_CXX_STANDARD=14
-D CMAKE_CXX_STANDARD=17
- cmake --build "$CI_PROJECT_DIR/build"
artifacts:
paths:
Expand Down Expand Up @@ -332,7 +358,7 @@ autotune:build:
-D AMDGPU_TARGETS=$GPU_TARGETS
-D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c
-D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx
-D CMAKE_CXX_STANDARD=14
-D CMAKE_CXX_STANDARD=17
- cmake --build . --target $BENCHMARK_TARGETS
- 'rm -rf $BUILD_DIR/benchmark/benchmark*.parallel'
# The autotune benchmarks get very large, above GitLab's upload limit. Fortunately, they compress well.
Expand All @@ -359,7 +385,7 @@ autotune:build:
matrix:
- BUILD_TYPE: Release
BUILD_TARGET: TEST
BUILD_VERSION: 14
BUILD_VERSION: 17
script:
- cd $BUILD_DIR
- cmake
Expand Down Expand Up @@ -398,6 +424,62 @@ test:all-gpus:
- .test:common
- .rules:test

.test:common-spirv:
stage: test
tags:
- rocm
- $GPU
extends:
- .cmake-minimum
allow_failure: true
timeout: 3h
needs:
- job: build:spirv
parallel:
matrix:
- BUILD_TYPE: Release
BUILD_TARGET: TEST
BUILD_VERSION: 17
image: "registry.streamhpc.internal/unstable-rocm:main"
script:
- cd $BUILD_DIR
- cmake
-D CMAKE_PREFIX_PATH=/opt/rocm
-P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake
- cat ./resources.json
# Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue.
# This causes the hipMemcpy to fail, which is not reported as an error by HIP.
# As a temporary workaround, disable the SDMA for test stability.
- HSA_ENABLE_SDMA=0 ctest
--output-on-failure
--repeat-until-fail 2
--resource-spec-file ./resources.json
--parallel $PARALLEL_JOBS
--exclude-regex rocprim.device_partition

test:any-gpu-spirv:
variables:
GPU: ""
PARALLEL_JOBS: 1
extends:
- .test:common-spirv
rules:
- if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/

test:label-arch-spirv:
extends:
- .gpus:rocm
- .test:common-spirv
- .rules:arch-labels

test:all-gpus-spirv:
variables:
SHOULD_BE_UNDRAFTED: "true"
extends:
- .gpus:rocm
- .test:common-spirv
- .rules:test

.test-windows-base:
stage: test
extends:
Expand Down Expand Up @@ -437,7 +519,7 @@ test-windows-release:
-D CMAKE_CXX_COMPILER="$AMDCLANG"
-D CMAKE_BUILD_TYPE=Release
-D AMDGPU_TARGETS=$GPU_TARGETS
-D CMAKE_CXX_STANDARD=14
-D CMAKE_CXX_STANDARD=17
-S "$CI_PROJECT_DIR/test/extra"
-B "$CI_PROJECT_DIR/package_test"
- cmake --build "$CI_PROJECT_DIR/package_test"
Expand All @@ -459,7 +541,7 @@ test:install:
-G Ninja
-D CMAKE_CXX_COMPILER="$AMDCLANG"
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_CXX_STANDARD=14
-D CMAKE_CXX_STANDARD=17
-B build
-S $CI_PROJECT_DIR
# Preserve $PATH when sudoing
Expand Down Expand Up @@ -507,7 +589,7 @@ benchmark:
matrix:
- BUILD_TYPE: Release
BUILD_TARGET: BENCHMARK
BUILD_VERSION: 14
BUILD_VERSION: 17
extends:
- .cmake-minimum
- .gpus:rocm
Expand Down
73 changes: 72 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,19 @@

Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/).

## rocPRIM 3.6.0 for ROCm 7.0
## rocPRIM 4.0.0 for ROCm 7.0

### Added

* Added `rocprim::accumulator_t` to ensure parity with CCCL.
* Added a test for `rocprim::accumulator_t`.
* Added `rocprim::invoke_result_r` to ensure parity with CCCL.
* Added function `is_build_in` into `rocprim::traits::get`.
* Added virtual shared memory as a fallback option in `rocprim::device_merge` when it exceeds shared memory capacity, similar to `rocprim::device_select`, `rocprim::device_partition`, and `rocprim::device_merge_sort`, which already include this feature.
* Added initial value support to device level inclusive scans.
* Added new optimization to the backend for `device_transform` when the input and output are pointers.
* Added `LoadType` to `transform_config`, which is used for the `device_transform` when the input and output are pointers.
* Added `rocprim::device_transform` for n-ary transform operations. This API takes `n` input iterators wrapped in a `rocprim::tuple`.

### Changed

Expand All @@ -24,6 +36,62 @@ This is a complete list of affected functions and how their default accumulator
* past default: `class AccType = detail::input_type_t<InitValueType>>`
* new default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>`

* Changed the parameters `long_radix_bits` and `LongRadixBits` from `segmented_radix_sort` to `radix_bits` and `RadixBits` respectively.
* Marked the initialisation constructor of `rocprim::reverse_iterator<Iter>` as `explicit`. Use `rocprim::make_reverse_iterator` instead.
* Merged `radix_key_codec` into type_traits system.
* Renamed `type_traits_interface.hpp` to `type_traits.hpp`, and renamed the original `type_traits.hpp` to `type_traits_functions.hpp`.
* Changed the default accumulator type for various device-level scan algorithms:
* `rocprim::inclusive_scan`
* Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
* Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>`
* `rocprim::deterministic_inclusive_scan`
* Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
* Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>`
* `rocprim::exclusive_scan`
* Previous default: `class AccType = detail::input_type_t<InitValueType>>`
* Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, rocprim::detail::input_type_t<InitValueType>>`
* `rocprim::deterministic_exclusive_scan`
* Previous default: `class AccType = detail::input_type_t<InitValueType>>`
* Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, rocprim::detail::input_type_t<InitValueType>>`

### Deprecations

* `rocprim::invoke_result_binary_op` and `rocprim::invoke_result_binary_op_t` are deprecated. Use `rocprim::accumulator_t` now.

### Removed

* Removed `rocprim::detail::float_bit_mask` and its related tests. Use `rocprim::traits::float_bit_mask` instead.
* Removed `rocprim::traits::is_fundamental`. Use `rocprim::traits::get<T>::is_fundamental()` directly instead.
* Removed the deprecated parameters `short_radix_bits` and `ShortRadixBits` from the `segmented_radix_sort` config. They were unused, so this is only an API change.
* Removed the deprecated `operator<<` from the iterators.
* Removed the deprecated `TwiddleIn` and `TwiddleOut`. Use `radix_key_codec` instead.
* Removed the deprecated flags API of `block_adjacent_difference`. Use `subtract_left()` or `block_discontinuity::flag_heads()` instead.
* Removed the deprecated `to_exclusive` functions in the warp scans.
* Removed the `rocprim::load_cs` from the `cache_load_modifier` enum. Use `rocprim::load_nontemporal` instead.
* Removed the `rocprim::store_cs` from the `cache_store_modifier` enum. Use `rocprim::store_nontemporal` instead.
* Removed the deprecated header file `rocprim/detail/match_result_type.hpp`. Include `rocprim/type_traits.hpp` instead.
* This header included `rocprim::detail::invoke_result`. Use `rocprim::invoke_result` instead.
* This header included `rocprim::detail::invoke_result_binary_op`. Use `rocprim::invoke_result_binary_op` instead.
* This header included `rocprim::detail::match_result_type`. Use `rocprim::invoke_result_binary_op_t` instead.
* Removed the deprecated `rocprim::detail::radix_key_codec` function. Use `rocprim::radix_key_codec` instead.
* Removed `rocprim/detail/radix_sort.hpp`. Its functionality can now be found in `rocprim/thread/radix_key_codec.hpp`.
* Removed C++14 support, only C++17 is supported.
* Due to the removal of `__AMDGCN_WAVEFRONT_SIZE` in the compiler, the following deprecated warp size-related symbols have been removed:
* `rocprim::device_warp_size()`
* For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory.
* For run-time constants, this is replaced with `rocprim::arch::wavefront::size()`.
* `rocprim::warp_size()`
* Use `rocprim::host_warp_size()`, `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead.
* `ROCPRIM_WAVEFRONT_SIZE`
* Use `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead.
* `__AMDGCN_WAVEFRONT_SIZE`
* This was a fallback define with the same name as the symbol that was removed from the compiler.

### Resolved issues

* Fixed an issue where `device_batch_memcpy` reported benchmarking throughput being 2x lower than it was in reality.
* Fixed an issue where `device_segmented_reduce` reported autotuning throughput being 5x lower than it was in reality.

## rocPRIM 3.5.0 for ROCm 6.5.0

### Removed
Expand All @@ -40,6 +108,7 @@ This is a complete list of affected functions and how their default accumulator
* Added the `rocprim::merge_inplace` function for merging in-place.
* Added initial value support for warp- and block-level inclusive scan.
* Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag.
* Added tests and documentation to `lookback_scan_state`. It is still in the `detail` namespace.

### Changed

Expand Down Expand Up @@ -599,3 +668,5 @@ The following is the complete list of affected functions and how their default a

* Switched to HIP-Clang as the default compiler
* CMake searches for rocPRIM locally first; if it's not found, CMake downloads it from GitHub


9 changes: 3 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,8 @@ set(CMAKE_HIP_STANDARD 14)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set(CMAKE_HIP_EXTENSIONS OFF)

# Set CXX standard
if (CMAKE_CXX_STANDARD EQUAL 14)
message(WARNING "C++14 will be deprecated in the next major release")
elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
message(FATAL_ERROR "Only C++14 and C++17 are supported")
if(NOT CMAKE_CXX_STANDARD EQUAL 17)
message(FATAL_ERROR "Only C++17 is supported")
endif()

if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
Expand Down Expand Up @@ -183,7 +180,7 @@ if(BUILD_CODE_COVERAGE)
endif()

# Setup VERSION
set(VERSION_STRING "3.5.0")
set(VERSION_STRING "4.0.0")
rocm_setup_version(VERSION ${VERSION_STRING})
math(EXPR rocprim_VERSION_NUMBER "${rocprim_VERSION_MAJOR} * 100000 + ${rocprim_VERSION_MINOR} * 100 + ${rocprim_VERSION_PATCH}")

Expand Down
3 changes: 2 additions & 1 deletion benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -174,6 +174,7 @@ add_rocprim_benchmark(benchmark_device_segmented_radix_sort_keys.cpp)
add_rocprim_benchmark(benchmark_device_segmented_radix_sort_pairs.cpp)
add_rocprim_benchmark(benchmark_device_segmented_reduce.cpp)
add_rocprim_benchmark(benchmark_device_transform.cpp)
add_rocprim_benchmark(benchmark_device_transform_pointer.cpp)
add_rocprim_benchmark(benchmark_predicate_iterator.cpp)
add_rocprim_benchmark(benchmark_warp_exchange.cpp)
add_rocprim_benchmark(benchmark_warp_reduce.cpp)
Expand Down
19 changes: 15 additions & 4 deletions benchmark/ConfigAutotuneSettings.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,23 @@ ${TUNING_TYPES};${LIMITED_TUNING_TYPES};using_warp_scan reduce_then_scan" PARENT
set(list_across "\
binary_search upper_bound lower_bound;${TUNING_TYPES};${LIMITED_TUNING_TYPES};64 128 256;1 2 4 8 16" PARENT_SCOPE)
set(output_pattern_suffix "@SubAlgorithm@_@ValueType@_@OutputType@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE)
elseif(file STREQUAL "benchmark_device_search_n")
set(list_across_names "InputType;BlockSize;ItemsPerThread;Threshold" PARENT_SCOPE)
set(list_across "\
${TUNING_TYPES};64 128 256 512 1024;1 2 4 8 16;4 8 12 16" PARENT_SCOPE)
set(output_pattern_suffix "@InputType@_@BlockSize@_@ItemsPerThread@_@Threshold@" PARENT_SCOPE)
elseif(file STREQUAL "benchmark_device_segmented_radix_sort_keys")
set(list_across_names "\
KeyType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
KeyType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
set(list_across "${TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE)
set(output_pattern_suffix "\
@KeyType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
@KeyType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
elseif(file STREQUAL "benchmark_device_segmented_radix_sort_pairs")
set(list_across_names "\
KeyType;ValueType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
KeyType;ValueType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS" PARENT_SCOPE)
set(list_across "${TUNING_TYPES};${LIMITED_TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256" PARENT_SCOPE)
set(output_pattern_suffix "\
@KeyType@_@ValueType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
@KeyType@_@ValueType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@" PARENT_SCOPE)
elseif(file STREQUAL "benchmark_device_segmented_reduce")
set(list_across_names "DataType;BlockSize;ItemsPerThread" PARENT_SCOPE)
set(list_across "\
Expand All @@ -108,6 +113,12 @@ DataType;BlockSize;" PARENT_SCOPE)
set(list_across "${TUNING_TYPES};64 128 256 512 1024" PARENT_SCOPE)
set(output_pattern_suffix "\
@DataType@_@BlockSize@" PARENT_SCOPE)
elseif(file STREQUAL "benchmark_device_transform_pointer")
set(list_across_names "\
DataType;BlockSize;LoadType" PARENT_SCOPE)
set(list_across "${TUNING_TYPES};64 128 256 512 1024;rocprim::load_default rocprim::load_nontemporal" PARENT_SCOPE)
set(output_pattern_suffix "\
@DataType@_@BlockSize@_@LoadType@" PARENT_SCOPE)
elseif(file STREQUAL "benchmark_device_partition")
set(list_across_names "DataType;BlockSize" PARENT_SCOPE)
set(list_across "${TUNING_TYPES};128 192 256 384 512" PARENT_SCOPE)
Expand Down
Loading