Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
fbc51d3
First algorithm changed for modernization of configs
NB4444 Sep 17, 2025
a1439f6
Resolve "Update configs for new config system part 5"
NB4444 Nov 5, 2025
99c7297
Resolve "Update configs for new config system part 4"
NB4444 Nov 6, 2025
c5d1ebf
Resolve "Update configs for new config system part 6"
NB4444 Nov 7, 2025
74d8689
Resolve "Update configs for new config system part 3"
NB4444 Nov 7, 2025
20d93ca
Resolve "Update configs for new config system part 1"
Saiyang-Zhang Nov 12, 2025
78de57f
Resolve "Update configs for new config system part 2"
NB4444 Nov 13, 2025
9cf6496
Update the config for radix_onesweep based on upstream changes
NB4444 Nov 13, 2025
43a9915
Resolve "New Config system tests"
NB4444 Nov 17, 2025
73b13ac
Resolve "Consistency in config tags"
Saiyang-Zhang Nov 24, 2025
629896f
Resolve "Remove all unused config functions old system"
NB4444 Nov 24, 2025
b3639b6
Resolve "Update autotune create_optimization script for new config sy…
NB4444 Nov 25, 2025
079fad8
Resolve "Update apply_config_improvement script for new configs"
NB4444 Nov 27, 2025
fa3b066
Added to CHANGELOG
NB4444 Nov 27, 2025
0b5a48e
Cleanup target_config
NB4444 Nov 28, 2025
293777c
Fix base block methods adjacent_difference_config
NB4444 Dec 2, 2025
6448f13
Clear previous caches before current one is created
NB4444 Dec 4, 2025
a6a1cf6
Give device_histogram the same fallback as previous configs system and
NB4444 Dec 5, 2025
8afe9ae
Resolve "Fix generic compile target new config system"
NB4444 Dec 11, 2025
79679b6
Manually fixing the worst regression after fixing predicate_flag
NB4444 Dec 11, 2025
c9fcc81
Add more arch for configs
NB4444 Dec 11, 2025
f23bea4
Add more supported architectures
NB4444 Dec 16, 2025
6dd20f0
Scope the define to rocprim
NB4444 Dec 16, 2025
f4be231
Add temp fix for failing test
NB4444 Dec 18, 2025
405ded6
TEMP FIX: instead of disabling optimization for array size one larger
NB4444 Dec 19, 2025
984c824
Replace workaround with less undefined fix
NB4444 Jan 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,24 @@ namespace detail
{

template<class Config,
class Selector,
class InputIterator,
class OutputIterator,
class OffsetIterator,
class ResultType,
class BinaryFunction>
inline hipError_t launch_segmented_arg_minmax(::rocprim::detail::target_arch arch,
InputIterator input,
OutputIterator output,
OffsetIterator begin_offsets,
OffsetIterator end_offsets,
BinaryFunction reduce_op,
ResultType initial_value,
ResultType empty_value,
dim3 grid,
dim3 block,
size_t shmem,
hipStream_t stream)
inline hipError_t launch_segmented_arg_minmax(::rocprim::detail::target current_target,
InputIterator input,
OutputIterator output,
OffsetIterator begin_offsets,
OffsetIterator end_offsets,
BinaryFunction reduce_op,
ResultType initial_value,
ResultType empty_value,
dim3 grid,
dim3 block,
size_t shmem,
hipStream_t stream)
{
auto kernel = [=](auto arch_config)
{
Expand Down Expand Up @@ -103,7 +104,12 @@ inline hipError_t launch_segmented_arg_minmax(::rocprim::detail::target_arch arc
}
};

return ::rocprim::detail::execute_launch_plan<Config>(arch, kernel, grid, block, shmem, stream);
return ::rocprim::detail::execute_launch_plan<Config, Selector>(current_target,
kernel,
grid,
block,
shmem,
stream);
}

/// Dispatch function similar to \p rocprim::segmented_reduce but writes \p empty_value for empty
Expand All @@ -129,17 +135,24 @@ inline hipError_t segmented_arg_minmax(void* temporary_storage,
using input_type = typename std::iterator_traits<InputIterator>::value_type;
using result_type = ::rocprim::accumulator_t<BinaryFunction, input_type>;

using config = ::rocprim::detail::wrapped_reduce_config<Config, result_type>;
using selector = ::rocprim::detail::segmented_reduce_config_selector<result_type>;

::rocprim::detail::target_arch target_arch;
hipError_t result = host_target_arch(stream, target_arch);
hipError_t result = ::rocprim::detail::host_target_arch(stream, target_arch);
if(result != hipSuccess)
{
return result;
}
const ::rocprim::detail::reduce_config_params params
= ::rocprim::detail::dispatch_target_arch<config, false>(target_arch);
::rocprim::detail::gpu target_gpu;
result = ::rocprim::detail::host_target_gpu(stream, target_gpu);
if(result != hipSuccess)
{
return result;
}

const ::rocprim::detail::target current_target(target_arch, target_gpu);

const auto params = ::rocprim::detail::get_config<selector>(Config{}, current_target);
const unsigned int block_size = params.kernel_config.block_size;

if(temporary_storage == nullptr)
Expand All @@ -160,18 +173,18 @@ inline hipError_t segmented_arg_minmax(void* temporary_storage,
start = std::chrono::high_resolution_clock::now();
}
ROCPRIM_RETURN_ON_ERROR(
launch_segmented_arg_minmax<config>(target_arch,
input,
output,
begin_offsets,
end_offsets,
reduce_op,
static_cast<result_type>(initial_value),
static_cast<result_type>(empty_value),
dim3(segments),
dim3(block_size),
0,
stream));
launch_segmented_arg_minmax<Config, selector>(current_target,
input,
output,
begin_offsets,
end_offsets,
reduce_op,
static_cast<result_type>(initial_value),
static_cast<result_type>(empty_value),
dim3(segments),
dim3(block_size),
0,
stream));
HIPCUB_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_arg_minmax", segments, start);

return hipSuccess;
Expand Down
6 changes: 6 additions & 0 deletions projects/rocprim/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/).

## rocPRIM x.y.z for ROCm 8.0

### Optimizations

* Updated config system to pick better fallback configs for untuned GPUs.

## rocPRIM 4.2.0 for ROCm 7.2

### Added
Expand Down
15 changes: 10 additions & 5 deletions projects/rocprim/benchmark/benchmark_device_batch_memcpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,18 +133,23 @@ BatchMemcpyData<ValueType, BufferSizeType> prepare_data(hipStream_t stre

BatchMemcpyData<ValueType, BufferSizeType> result;

using config
= rocprim::detail::wrapped_batch_memcpy_config<rocprim::default_config, ValueType, true>;
using Selector = rocprim::detail::batch_memcpy_config_selector<ValueType, IsMemCpy>;

rocprim::detail::target_arch target_arch;
hipError_t success = rocprim::detail::host_target_arch(stream, target_arch);
hipError_t success = host_target_arch(stream, target_arch);

rocprim::detail::gpu target_gpu;
success = host_target_gpu(stream, target_gpu);

if(success != hipSuccess)
{
return result;
}

const rocprim::detail::batch_memcpy_config_params params
= rocprim::detail::dispatch_target_arch<config, false>(target_arch);
const rocprim::detail::target get_target(target_arch, target_gpu);

const auto params
= rocprim::detail::get_config<Selector>(rocprim::default_config{}, get_target);

const int32_t wlev_min_size = params.wlev_size_threshold;
const int32_t blev_min_size = params.blev_size_threshold;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,21 @@ struct device_histogram_benchmark : public benchmark_utils::autotune_interface
+ ",cfg:" + config_name<Config>() + "}");
}

/// \brief Clears the input caches of every type in \p Args except the benchmark's own type \p T.
///
/// For each type \c U in \p Args that differs from \p T, this calls
/// \c input_cache<U>::instance().clear(). The cache belonging to \p T is left untouched.
///
/// \tparam Args Candidate types whose caches are cleared. Each type must be
/// default-constructible: a value-initialized instance is passed to the helper lambda
/// solely to carry the type.
template<class... Args>
void clear_other_caches()
{
    // Clears the cache of the tag's type unless it is the type this benchmark runs on.
    [[maybe_unused]] const auto clear_unless_current = [](auto tag)
    {
        using U = decltype(tag);
        // The comparison is a compile-time constant, so `if constexpr` discards the
        // U == T branch instead of compiling a call that can never execute.
        if constexpr(!std::is_same_v<T, U>)
        {
            input_cache<U>::instance().clear();
        }
    };

    // Fold over the comma operator: the helper is invoked once per type, left to right.
    (clear_unless_current(Args{}), ...);
}

void run(benchmark_utils::state&& state) override
{
const auto& stream = state.stream;
Expand All @@ -220,6 +235,16 @@ struct device_histogram_benchmark : public benchmark_utils::autotune_interface
};
};

// Clear caches for other types that are either empty or already done.
clear_other_caches<rocprim::int128_t,
int64_t,
int,
short,
int8_t,
double,
float,
rocprim::half>();

const std::size_t size = bytes / Channels;

size_t temporary_storage_bytes = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ template<typename T,
typename Config = rocprim::default_config>
struct device_transform_benchmark : public benchmark_utils::autotune_interface
{

std::string name() const override
{

Expand Down Expand Up @@ -122,13 +121,15 @@ struct device_transform_benchmark : public benchmark_utils::autotune_interface
{
const auto launch = [&]
{
using Selector = rocprim::detail::transform_config_selector<T, IsPointer>;
auto transform_op = [](T v) { return v + T(5); };
return rocprim::detail::transform_impl<IsPointer, Config>(d_input.get(),
d_output.get(),
size,
transform_op,
stream,
debug_synchronous);
return rocprim::detail::transform_impl<IsPointer, Config, Selector>(
d_input.get(),
d_output.get(),
size,
transform_op,
stream,
debug_synchronous);
};

state.run([&] { HIP_CHECK(launch()); });
Expand Down
126 changes: 69 additions & 57 deletions projects/rocprim/rocprim/include/rocprim/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,88 +152,100 @@
#if !defined(ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS)
#define ROCPRIM_THREAD_STORE_USE_CACHE_MODIFIERS 1
#endif
#define IS_CDNA3() \
__builtin_amdgcn_processor_is("gfx942") || __builtin_amdgcn_processor_is("gfx950") \
|| __builtin_amdgcn_processor_is("gfx9-4-generic")
#define IS_CDNA2() __builtin_amdgcn_processor_is("gfx90a")
#define IS_CDNA1() __builtin_amdgcn_processor_is("gfx908")
#define IS_GCN5() \
__builtin_amdgcn_processor_is("gfx900") || __builtin_amdgcn_processor_is("gfx902") \
|| __builtin_amdgcn_processor_is("gfx904") || __builtin_amdgcn_processor_is("gfx906") \
|| __builtin_amdgcn_processor_is("gfx90c") \
|| __builtin_amdgcn_processor_is("gfx9-generic")
#define IS_RDNA4() \
__builtin_amdgcn_processor_is("gfx1200") || __builtin_amdgcn_processor_is("gfx1201") \
|| __builtin_amdgcn_processor_is("gfx12-generic") // TODO: Re-enable gfx1250 when supported by compiler
#define IS_RDNA3() \
__builtin_amdgcn_processor_is("gfx1100") || __builtin_amdgcn_processor_is("gfx1101") \
|| __builtin_amdgcn_processor_is("gfx1102") \
|| __builtin_amdgcn_processor_is("gfx1103") \
|| __builtin_amdgcn_processor_is("gfx1152") \
|| __builtin_amdgcn_processor_is("gfx1153") \
|| __builtin_amdgcn_processor_is("gfx11-generic")
#define IS_RDNA2() \
__builtin_amdgcn_processor_is("gfx1030") || __builtin_amdgcn_processor_is("gfx1031") \
|| __builtin_amdgcn_processor_is("gfx1032") \
|| __builtin_amdgcn_processor_is("gfx1033") \
|| __builtin_amdgcn_processor_is("gfx1034") \
|| __builtin_amdgcn_processor_is("gfx1035") \
|| __builtin_amdgcn_processor_is("gfx1036") \
|| __builtin_amdgcn_processor_is("gfx10-3-generic")
#define IS_RDNA1() \
__builtin_amdgcn_processor_is("gfx1010") || __builtin_amdgcn_processor_is("gfx1011") \
|| __builtin_amdgcn_processor_is("gfx1012") \
|| __builtin_amdgcn_processor_is("gfx1013") \
|| __builtin_amdgcn_processor_is("gfx10-1-generic")
#define IS_GCN3() \
__builtin_amdgcn_processor_is("gfx801") || __builtin_amdgcn_processor_is("gfx802") \
|| __builtin_amdgcn_processor_is("gfx803") || __builtin_amdgcn_processor_is("gfx805") \
|| __builtin_amdgcn_processor_is("gfx810")
/// Processor-family predicates built on \c __builtin_amdgcn_processor_is.
///
/// Each \c ROCPRIM_IS_<family>() macro expands to a single parenthesized expression that
/// ORs together one \c __builtin_amdgcn_processor_is check per listed processor name.
/// The outer parentheses keep the whole disjunction together when the macro is negated or
/// combined with other operators (e.g. <tt>!ROCPRIM_IS_CDNA3()</tt> or
/// <tt>ROCPRIM_IS_CDNA3() && x</tt>).
///
/// NOTE(review): the condition of the enclosing \c #if that selects these definitions is
/// outside this view - confirm when these builtin-based variants are active.
#define ROCPRIM_IS_CDNA3() \
(__builtin_amdgcn_processor_is("gfx942") || __builtin_amdgcn_processor_is("gfx950") \
|| __builtin_amdgcn_processor_is("gfx9-4-generic"))
#define ROCPRIM_IS_CDNA2() (__builtin_amdgcn_processor_is("gfx90a"))
#define ROCPRIM_IS_CDNA1() (__builtin_amdgcn_processor_is("gfx908"))
#define ROCPRIM_IS_GCN5() \
(__builtin_amdgcn_processor_is("gfx900") || __builtin_amdgcn_processor_is("gfx902") \
|| __builtin_amdgcn_processor_is("gfx904") || __builtin_amdgcn_processor_is("gfx906") \
|| __builtin_amdgcn_processor_is("gfx90c") \
|| __builtin_amdgcn_processor_is("gfx9-generic"))
#define ROCPRIM_IS_RDNA4() \
(__builtin_amdgcn_processor_is("gfx1200") || __builtin_amdgcn_processor_is("gfx1201") \
|| __builtin_amdgcn_processor_is( \
"gfx12-generic")) // TODO: Re-enable gfx1250 when supported by compiler
#define ROCPRIM_IS_RDNA3() \
(__builtin_amdgcn_processor_is("gfx1100") || __builtin_amdgcn_processor_is("gfx1101") \
|| __builtin_amdgcn_processor_is("gfx1102") || __builtin_amdgcn_processor_is("gfx1103") \
|| __builtin_amdgcn_processor_is("gfx1150") || __builtin_amdgcn_processor_is("gfx1151") \
|| __builtin_amdgcn_processor_is("gfx1152") || __builtin_amdgcn_processor_is("gfx1153") \
|| __builtin_amdgcn_processor_is("gfx11-generic"))
#define ROCPRIM_IS_RDNA2() \
(__builtin_amdgcn_processor_is("gfx1030") || __builtin_amdgcn_processor_is("gfx1031") \
|| __builtin_amdgcn_processor_is("gfx1032") || __builtin_amdgcn_processor_is("gfx1033") \
|| __builtin_amdgcn_processor_is("gfx1034") || __builtin_amdgcn_processor_is("gfx1035") \
|| __builtin_amdgcn_processor_is("gfx1036") \
|| __builtin_amdgcn_processor_is("gfx10-3-generic"))
#define ROCPRIM_IS_RDNA1() \
(__builtin_amdgcn_processor_is("gfx1010") || __builtin_amdgcn_processor_is("gfx1011") \
|| __builtin_amdgcn_processor_is("gfx1012") || __builtin_amdgcn_processor_is("gfx1013") \
|| __builtin_amdgcn_processor_is("gfx10-1-generic"))
#define ROCPRIM_IS_GCN3() \
(__builtin_amdgcn_processor_is("gfx801") || __builtin_amdgcn_processor_is("gfx802") \
|| __builtin_amdgcn_processor_is("gfx803") || __builtin_amdgcn_processor_is("gfx805") \
|| __builtin_amdgcn_processor_is("gfx810"))
/// True when \c __builtin_amdgcn_processor_is matches any of the six "-generic" processor
/// names listed below. The expansion is fully parenthesized so it can be negated or
/// combined with other operators without changing its meaning.
#define ROCPRIM_IS_GENERIC() \
(__builtin_amdgcn_processor_is("gfx9-4-generic") \
|| __builtin_amdgcn_processor_is("gfx9-generic") \
|| __builtin_amdgcn_processor_is("gfx11-generic") \
|| __builtin_amdgcn_processor_is("gfx10-3-generic") \
|| __builtin_amdgcn_processor_is("gfx10-1-generic") \
|| __builtin_amdgcn_processor_is("gfx12-generic"))
#else
#if defined(ROCPRIM_TARGET_CDNA3)
#define IS_CDNA3() 1
#define ROCPRIM_IS_CDNA3() 1
#else
#define IS_CDNA3() 0
#define ROCPRIM_IS_CDNA3() 0
#endif
#if defined(ROCPRIM_TARGET_CDNA2)
#define IS_CDNA2() 1
#define ROCPRIM_IS_CDNA2() 1
#else
#define IS_CDNA2() 0
#define ROCPRIM_IS_CDNA2() 0
#endif
#if defined(ROCPRIM_TARGET_CDNA1)
#define IS_CDNA1() 1
#define ROCPRIM_IS_CDNA1() 1
#else
#define IS_CDNA1() 0
#define ROCPRIM_IS_CDNA1() 0
#endif
#if defined(ROCPRIM_TARGET_GCN5)
#define IS_GCN5() 1
#define ROCPRIM_IS_GCN5() 1
#else
#define IS_GCN5() 0
#define ROCPRIM_IS_GCN5() 0
#endif
#if defined(ROCPRIM_TARGET_RDNA4)
#define IS_RDNA4() 1
#define ROCPRIM_IS_RDNA4() 1
#else
#define IS_RDNA4() 0
#define ROCPRIM_IS_RDNA4() 0
#endif
#if defined(ROCPRIM_TARGET_RDNA3)
#define IS_RDNA3() 1
#define ROCPRIM_IS_RDNA3() 1
#else
#define IS_RDNA3() 0
#define ROCPRIM_IS_RDNA3() 0
#endif
#if defined(ROCPRIM_TARGET_RDNA2)
#define IS_RDNA2() 1
#define ROCPRIM_IS_RDNA2() 1
#else
#define IS_RDNA2() 0
#define ROCPRIM_IS_RDNA2() 0
#endif
#if defined(ROCPRIM_TARGET_RDNA1)
#define IS_RDNA1() 1
#define ROCPRIM_IS_RDNA1() 1
#else
#define IS_RDNA1() 0
#define ROCPRIM_IS_RDNA1() 0
#endif
#if defined(ROCPRIM_TARGET_GCN3)
#define IS_GCN3() 1
#define ROCPRIM_IS_GCN3() 1
#else
#define IS_GCN3() 0
#define ROCPRIM_IS_GCN3() 0
#endif

/// Preprocessor-time variant of ROCPRIM_IS_GENERIC(): expands to 1 when any of the listed
/// \c __gfx*_generic__ macros is defined, and to 0 otherwise.
/// NOTE(review): presumably these macros are predefined by the compiler when building for
/// a generic target - confirm against the toolchain documentation.
#if defined(__gfx9_generic__) || defined(__gfx9_4_generic__) || defined(__gfx10_1_generic__) \
|| defined(__gfx10_3_generic__) || defined(__gfx11_generic__) \
|| defined(__gfx12_generic__)
#define ROCPRIM_IS_GENERIC() 1
#else
#define ROCPRIM_IS_GENERIC() 0
#endif

#if !defined(ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS)
Expand Down Expand Up @@ -267,7 +279,7 @@
#define ROCPRIM_DETAIL_HAS_DPP 1
#endif

#if (!defined(ROCPRIM_DISABLE_DPP) || ROCPRIM_DISABLE_DPP == 0) \
#if(!defined(ROCPRIM_DISABLE_DPP) || ROCPRIM_DISABLE_DPP == 0) \
&& (defined(ROCPRIM_DETAIL_HAS_DPP) && ROCPRIM_DETAIL_HAS_DPP == 1)
#define ROCPRIM_DETAIL_USE_DPP 1
#else
Expand All @@ -292,7 +304,7 @@
/// Quad size (group of 4 threads)
#define ROCPRIM_QUAD_SIZE 4u

#if (defined(_MSC_VER) && !defined(__clang__)) || (defined(__GNUC__) && !defined(__clang__))
#if(defined(_MSC_VER) && !defined(__clang__)) || (defined(__GNUC__) && !defined(__clang__))
#define ROCPRIM_UNROLL
#define ROCPRIM_NO_UNROLL
#else
Expand Down
Loading
Loading