diff --git a/CHANGELOG.md b/CHANGELOG.md index f405848e9..bfa0aa7c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,12 @@ This is a complete list of affected functions and how their default accumulator * Added the `rocprim::merge_inplace` function for merging in-place. * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. +* Added additional unit tests for `test_block_load.hpp` +* Added additional unit tests for `test_block_rank.hpp` +* Added additional unit tests for `test_block_scan.hpp` +* Added additional unit tests for `test_block_sort.hpp` +* Added additional unit tests for `test_block_store.hpp` +* Added missing `rank_keys_desc` with `digit_extractor` parameter for `block_radix_rank_match.hpp` ### Changed diff --git a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp index 7f3b9da6c..cf621d00c 100644 --- a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp +++ b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp @@ -271,6 +271,19 @@ class block_radix_rank_match rank_keys_impl(keys, ranks, storage.get(), begin_bit, pass_bits); } + template + ROCPRIM_DEVICE void rank_keys_desc(const Key (&keys)[ItemsPerThread], + unsigned int (&ranks)[ItemsPerThread], + storage_type& storage, + DigitExtractor digit_extractor) + { + rank_keys_impl(keys, ranks, storage.get(), + [&digit_extractor](const Key & key){ + const unsigned int digit = digit_extractor(key); + return radix_digits - 1 - digit; + }); + } + template ROCPRIM_DEVICE void rank_keys(const Key (&keys)[ItemsPerThread], unsigned int (&ranks)[ItemsPerThread], diff --git a/test/rocprim/test_block_load_store.hpp b/test/rocprim/test_block_load_store.hpp index 382a8061f..bafb9b95b 100644 --- a/test/rocprim/test_block_load_store.hpp +++ b/test/rocprim/test_block_load_store.hpp @@ -192,6 +192,179 @@ typed_test_def(suite_name, name_suffix, LoadStoreClassValid) } +typed_test_def(suite_name, name_suffix, LoadStoreClassWithStorage) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using Type = typename TestFixture::params::type; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; + // Given block size not supported + if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) + { + return; + } + + if(load_method == rocprim::block_load_method::block_load_warp_transpose + || store_method == rocprim::block_store_method::block_store_warp_transpose) + { + unsigned int host_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size)); + if(block_size % host_warp_size != 0) + { + GTEST_SKIP() << "Cannot run test of block size " << block_size + << " on a device with warp size " << host_warp_size; + } + } + + const size_t valid = items_per_block; + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector input + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + std::vector output(input.size(), (Type)0); + + // Calculate expected results on host + std::vector expected(input.size(), (Type)0); + for (size_t i = 0; i < 113; i++) + { + size_t block_offset = i * items_per_block; + for (size_t j = 0; j < items_per_block; j++) + { + if (j < valid) + { + expected[j + block_offset] = input[j + block_offset]; + } + } + } + + // Preparing device + common::device_ptr device_input(input); + // Have to initialize output for unvalid data to make sure they are not changed + common::device_ptr device_output(output); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get()); + HIP_CHECK(hipGetLastError()); + + // Reading results from device + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } + +} + +typed_test_def(suite_name, name_suffix, LoadStoreClassValidWithStorage) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using Type = typename TestFixture::params::type; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; + // Given block size not supported + if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) + { + return; + } + + if(load_method == rocprim::block_load_method::block_load_warp_transpose + || store_method == rocprim::block_store_method::block_store_warp_transpose) + { + unsigned int host_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size)); + if(block_size % host_warp_size != 0) + { + GTEST_SKIP() << "Cannot run test of block size " << block_size + << " on a device with warp size " << host_warp_size; + } + } + + const size_t valid = items_per_block - 32; + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector input + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + std::vector output(input.size(), (Type)0); + + // Calculate expected results on host + std::vector expected(input.size(), (Type)0); + for (size_t i = 0; i < 113; i++) + { + size_t block_offset = i * items_per_block; + for (size_t j = 0; j < items_per_block; j++) + { + if (j < valid) + { + expected[j + block_offset] = input[j + block_offset]; + } + } + } + + // Preparing device + common::device_ptr device_input(input); + // Have to initialize output for unvalid data to make sure they are not changed + common::device_ptr device_output(output); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_storage_valid_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get(), + valid); + HIP_CHECK(hipGetLastError()); + + // Reading results from device + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } + +} + typed_test_def(suite_name, name_suffix, LoadStoreClassDefault) { int device_id = test_common_utils::obtain_device_from_ctest(); @@ -277,3 +450,89 @@ typed_test_def(suite_name, name_suffix, LoadStoreClassDefault) ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); } } + +typed_test_def(suite_name, name_suffix, LoadStoreClassDefaultWithStorage) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using Type = typename TestFixture::params::type; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; + // Given block size not supported + if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) + { + return; + } + + if(load_method == rocprim::block_load_method::block_load_warp_transpose + || store_method == rocprim::block_store_method::block_store_warp_transpose) + { + unsigned int host_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size)); + if(block_size % host_warp_size != 0) + { + GTEST_SKIP() << "Cannot run test of block size " << block_size + << " on a device with warp size " << host_warp_size; + } + } + + const size_t valid = items_per_thread + 1; + Type _default = (Type)-1; + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector input + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Calculate expected results on host + std::vector expected(input.size(), _default); + for (size_t i = 0; i < 113; i++) + { + size_t block_offset = i * items_per_block; + for (size_t j = 0; j < items_per_block; j++) + { + if (j < valid) + { + expected[j + block_offset] = input[j + block_offset]; + } + } + } + + // Preparing device + common::device_ptr device_input(input); + common::device_ptr device_output(size); + + // Running kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_valid_default_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input.get(), + device_output.get(), + valid, + _default); + HIP_CHECK(hipGetLastError()); + + // Reading results from device + const auto output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } +} diff --git a/test/rocprim/test_block_load_store.kernels.hpp b/test/rocprim/test_block_load_store.kernels.hpp index cf8bb090a..4f1b12f83 100644 --- a/test/rocprim/test_block_load_store.kernels.hpp +++ b/test/rocprim/test_block_load_store.kernels.hpp @@ -553,6 +553,100 @@ void load_store_valid_kernel(Type* device_input, Type* device_output, size_t val store.store(device_output + offset, _items, (unsigned int)valid); } +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t<(enable_block_load_store_test::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_kernel(Type* device_input, Type* device_output) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + + ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage; + ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage; + + load.load(device_input + offset, _items, load_storage); + store.store(device_output + offset, _items, store_storage); +} + +// The default kernel in case load store storage kernel fails to build +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_kernel(Type* device_input, Type* device_output) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + load.load(device_input + offset, _items); + store.store(device_output + offset, _items); +} + +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t<(enable_block_load_store_test::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_valid_kernel(Type* device_input, Type* device_output, size_t valid) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + + ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage; + ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage; + + load.load(device_input + offset, _items, (unsigned int)valid, load_storage); + store.store(device_output + offset, _items, (unsigned int)valid, store_storage); +} + +// The default kernel in case load store storage valid kernel fails to build +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + std::enable_if_t::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_storage_valid_kernel(Type* device_input, Type* device_output, size_t valid) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + load.load(device_input + offset, _items, (unsigned int)valid); + store.store(device_output + offset, _items, (unsigned int)valid); +} + template< class Type, rocprim::block_load_method LoadMethod, @@ -574,4 +668,53 @@ void load_store_valid_default_kernel(Type* device_input, Type* device_output, si store.store(device_output + offset, _items); } +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + class Def, + std::enable_if_t<(enable_block_load_store_test::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_valid_default_storage_kernel(Type* device_input, Type* device_output, size_t valid, Def _default) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + + ROCPRIM_SHARED_MEMORY typename impl::block_load::storage_type load_storage; + ROCPRIM_SHARED_MEMORY typename impl::block_store::storage_type store_storage; + + load.load(device_input + offset, _items, (unsigned int)valid, _default, load_storage); + store.store(device_output + offset, _items, store_storage); +} + +// The default kernel in case load_store_valid_default_storage_kernel fails to build +template< + class Type, + rocprim::block_load_method LoadMethod, + rocprim::block_store_method StoreMethod, + unsigned int BlockSize, + unsigned int ItemsPerThread, + class Def, + std::enable_if_t::value), int> = 0 +> +__global__ +__launch_bounds__(BlockSize) +void load_store_valid_default_storage_kernel(Type* device_input, Type* device_output, size_t valid, Def _default) +{ + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; + using impl = get_block_load_store; + typename impl::block_load load; + typename impl::block_store store; + load.load(device_input + offset, _items, (unsigned int)valid, _default); + store.store(device_output + offset, _items); +} + #endif // TEST_BLOCK_LOAD_STORE_KERNELS_HPP_ diff --git a/test/rocprim/test_block_radix_rank.cpp.in b/test/rocprim/test_block_radix_rank.cpp.in index 8e793b5f0..656035aef 100644 --- a/test/rocprim/test_block_radix_rank.cpp.in +++ b/test/rocprim/test_block_radix_rank.cpp.in @@ -47,13 +47,20 @@ #if ROCPRIM_TEST_SUITE_SLICE == 0 TYPED_TEST_P(SUITE, RankBasic ) { test_block_radix_rank_algorithm(); } - REGISTER_TYPED_TEST_SUITE_P(SUITE, RankBasic); + TYPED_TEST_P(SUITE, RankBasicExtractor ) { test_block_radix_rank_extractor_algorithm(); } + TYPED_TEST_P(SUITE, RankBasicExtractorWithPrefixCount ) { test_block_radix_rank_extractor_with_prefix_count_algorithm(); } + REGISTER_TYPED_TEST_SUITE_P(SUITE, RankBasic, RankBasicExtractor, RankBasicExtractorWithPrefixCount); + #elif ROCPRIM_TEST_SUITE_SLICE == 1 TYPED_TEST_P(SUITE, RankMemoize) { test_block_radix_rank_algorithm(); } - REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMemoize); + TYPED_TEST_P(SUITE, RankMemoizeExtractor) { test_block_radix_rank_extractor_algorithm(); } + TYPED_TEST_P(SUITE, RankMemoizeExtractorWithPrefixCount) { test_block_radix_rank_extractor_with_prefix_count_algorithm(); } + REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMemoize, RankMemoizeExtractor, RankMemoizeExtractorWithPrefixCount); #elif ROCPRIM_TEST_SUITE_SLICE == 2 TYPED_TEST_P(SUITE, RankMatch ) { test_block_radix_rank_algorithm(); } - REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMatch); + TYPED_TEST_P(SUITE, RankMatchExtractor ) { test_block_radix_rank_extractor_algorithm(); } + TYPED_TEST_P(SUITE, RankMatchExtractorWithPrefixCount ) { test_block_radix_rank_extractor_with_prefix_count_algorithm(); } + REGISTER_TYPED_TEST_SUITE_P(SUITE, RankMatch, RankMatchExtractor, RankMatchExtractorWithPrefixCount); #endif #if ROCPRIM_TEST_TYPE_SLICE == 0 diff --git a/test/rocprim/test_block_radix_rank.hpp b/test/rocprim/test_block_radix_rank.hpp index e078fdcfa..e3b2b7b4f 100644 --- a/test/rocprim/test_block_radix_rank.hpp +++ b/test/rocprim/test_block_radix_rank.hpp @@ -59,8 +59,13 @@ static constexpr size_t n_sizes = 12; static constexpr unsigned int items_per_thread[n_sizes] = {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3}; static constexpr unsigned int rank_desc[n_sizes] = {false, false, false, false, false, false, true, true, true, true, true, true}; +static constexpr unsigned int use_storage[n_sizes] + = {false, true, false, true, false, true, false, true, false, true, false, true}; +static constexpr unsigned int end_bits[n_sizes] + = {0x1, 0x3, 0x7, 0xf, 0x1, 0x3, 0x7, 0xf, 0x1, 0x3, 0x7, 0xf}; static constexpr unsigned int pass_start_bit[n_sizes] = {0, 0, 0, 6, 2, 1, 0, 0, 0, 1, 4, 7}; static constexpr unsigned int max_radix_bits[n_sizes] = {4, 3, 5, 3, 1, 5, 4, 2, 4, 3, 1, 2}; +static constexpr unsigned int max_radix_bits_extractor[n_sizes] = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}; static constexpr unsigned int pass_radix_bits[n_sizes] = {0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 1}; template +__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const items_input, + unsigned int* const ranks_output, + const bool descending, + const bool use_storage, + const unsigned int last_bits) +{ + using block_rank_type = rocprim::block_radix_rank; + using keys_exchange_type = rocprim::block_exchange; + using ranks_exchange_type = rocprim::block_exchange; + + constexpr bool warp_striped = Algorithm == rocprim::block_radix_rank_algorithm::match; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + + ROCPRIM_SHARED_MEMORY union + { + typename keys_exchange_type::storage_type keys_exchange; + typename block_rank_type::storage_type rank; + typename ranks_exchange_type::storage_type ranks_exchange; + } storage; + + T keys[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + + rocprim::block_load_direct_blocked(lid, items_input + block_offset, keys); + if ROCPRIM_IF_CONSTEXPR(warp_striped) + { + // block_radix_rank_match requires warp striped input and output. Instead of using + // rocprim::block_load_direct_warp_striped though, we load directly and exchange the + // values manually, as we can also test with block sizes that do not divide the hardware + // warp size that way. + keys_exchange_type().blocked_to_warp_striped(keys, keys, storage.keys_exchange); + rocprim::syncthreads(); + } + + union converter{ + T in; + uint64_t out; + }; + + if(descending) + { + if (use_storage) + block_rank_type().rank_keys_desc(keys, ranks, storage.rank, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); + else + block_rank_type().rank_keys_desc(keys, ranks, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); + } + else + { + if (use_storage) + block_rank_type().rank_keys(keys, ranks, storage.rank, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); + else + block_rank_type().rank_keys(keys, ranks, [=](const T & key){ + converter c; + c.in = key; + uint64_t out = c.out & last_bits; + return out; + }); + } + + if ROCPRIM_IF_CONSTEXPR(warp_striped) + { + // See the comment above. + rocprim::syncthreads(); + ranks_exchange_type().warp_striped_to_blocked(ranks, ranks, storage.ranks_exchange); + } + rocprim::block_store_direct_blocked(lid, ranks_output + block_offset, ranks); +} + +template +__global__ __launch_bounds__(BlockSize) void rank_kernel(const T* const items_input, + unsigned int* const ranks_output, + unsigned int* prefix_output, + unsigned int* counts_output, + Extractor digit_extractor + ) +{ + using block_rank_type = rocprim::block_radix_rank; + using keys_exchange_type = rocprim::block_exchange; + using ranks_exchange_type = rocprim::block_exchange; + + constexpr bool warp_striped = Algorithm == rocprim::block_radix_rank_algorithm::match; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; + + ROCPRIM_SHARED_MEMORY union + { + typename keys_exchange_type::storage_type keys_exchange; + typename block_rank_type::storage_type rank; + typename ranks_exchange_type::storage_type ranks_exchange; + } storage; + + T keys[ItemsPerThread]; + unsigned int ranks[ItemsPerThread]; + + const unsigned int digits_per_thread = block_rank_type().digits_per_thread; + + unsigned int prefix[digits_per_thread]; + unsigned int counts[digits_per_thread]; + rocprim::block_load_direct_blocked(lid, items_input + block_offset, keys); + if ROCPRIM_IF_CONSTEXPR(warp_striped) + { + // block_radix_rank_match requires warp striped input and output. Instead of using + // rocprim::block_load_direct_warp_striped though, we load directly and exchange the + // values manually, as we can also test with block sizes that do not divide the hardware + // warp size that way. + keys_exchange_type().blocked_to_warp_striped(keys, keys, storage.keys_exchange); + rocprim::syncthreads(); + } + + block_rank_type().rank_keys(keys, ranks, storage.rank, digit_extractor, prefix, counts); + + if ROCPRIM_IF_CONSTEXPR(warp_striped) + { + // See the comment above. + rocprim::syncthreads(); + ranks_exchange_type().warp_striped_to_blocked(ranks, ranks, storage.ranks_exchange); + } + rocprim::block_store_direct_blocked(lid, ranks_output + block_offset, ranks); + + // storing count and prefix output + const size_t pc_offset = (threadIdx.x * digits_per_thread) + (blockIdx.x * (1 << RadixBits)); + + for(size_t i = 0; i < digits_per_thread; i++){ + if((threadIdx.x * digits_per_thread) + i < (1 << RadixBits)){ + prefix_output[pc_offset + i] = prefix[i]; + counts_output[pc_offset + i] = counts[i]; + } + } +} + template void test_block_radix_rank() { @@ -141,6 +315,7 @@ void test_block_radix_rank() constexpr size_t radix_bits = RadixBits; constexpr size_t end_bit = start_bit + radix_bits; constexpr bool descending = Descending; + constexpr bool use_storage = UseStorage; constexpr rocprim::block_radix_rank_algorithm algorithm = Algorithm; const size_t grid_size = 23; @@ -205,6 +380,7 @@ void test_block_radix_rank() d_keys_input.get(), d_ranks_output.get(), descending, + use_storage, start_bit, radix_bits); HIP_CHECK(hipGetLastError()); @@ -216,6 +392,259 @@ void test_block_radix_rank() } } +template +void test_block_radix_extractor_rank() +{ + constexpr size_t block_size = BlockSize; + constexpr size_t items_per_thread = ItemsPerThread; + constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t max_radix_bits = MaxRadixBits; + constexpr size_t end_bits = EndBits; + constexpr bool descending = Descending; + constexpr bool use_storage = UseStorage; + constexpr rocprim::block_radix_rank_algorithm algorithm = Algorithm; + + const size_t grid_size = 2; + const size_t size = items_per_block * grid_size; + + SCOPED_TRACE(testing::Message() << "with block_size = " << block_size); + SCOPED_TRACE(testing::Message() << "with items_per_thread = " << items_per_thread); + SCOPED_TRACE(testing::Message() << "with descending = " << (descending ? "true" : "false")); + SCOPED_TRACE(testing::Message() << "with max_radix_bits = " << MaxRadixBits); + SCOPED_TRACE(testing::Message() << "with grid_size = " << size); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + for(size_t seed_index = 0; seed_index < number_of_runs; ++seed_index) + { + seed_type seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector keys_input + = test_utils::get_random_data_wrapped(size, + common::generate_limits::min(), + common::generate_limits::max(), + seed_value); + + + union converter{ + T in; + uint64_t out; + }; + // Calculated expected results on host + std::vector expected(size); + for(size_t i = 0; i < grid_size; ++i) + { + size_t block_offset = i * items_per_block; + + // Perform an 'argsort', which gives a sorted sequence of indices into `keys_input`. + std::vector indices(items_per_block); + std::iota(indices.begin(), indices.end(), 0); + + std::stable_sort( + indices.begin(), + indices.end(), + [&](const int& i, const int& j) + { + converter c; + c.in = keys_input[block_offset + i]; + uint64_t left = c.out & end_bits; + + c.in = keys_input[block_offset + j]; + + uint64_t right = c.out & end_bits; + + return descending ? right < left : left < right; + }); + + // Invert the sorted indices sequence to obtain the ranks. + for(size_t j = 0; j < items_per_block; ++j) + { + expected[block_offset + indices[j]] = static_cast(j); + } + } + + common::device_ptr d_keys_input(keys_input); + common::device_ptr d_ranks_output(size); + + // Running kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + rank_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + d_keys_input.get(), + d_ranks_output.get(), + descending, + use_storage, + end_bits); + HIP_CHECK(hipGetLastError()); + + // Getting results to host + auto ranks_output = d_ranks_output.load(); + + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(ranks_output, expected)); + } +} + +template +void test_block_radix_rank_with_prefix_and_count() +{ + constexpr size_t block_size = BlockSize; + constexpr size_t items_per_thread = ItemsPerThread; + constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t max_radix_bits = MaxRadixBits; + constexpr size_t radix_bits = RadixBits; + constexpr rocprim::block_radix_rank_algorithm algorithm = Algorithm; + + const size_t grid_size = 2; + const size_t size = items_per_block * grid_size; + const size_t pc_items_per_block = (1 << radix_bits); + const uint64_t end_bits = ((pc_items_per_block) - 1); + const size_t pc_size = (pc_items_per_block) * grid_size; + + SCOPED_TRACE(testing::Message() << "with block_size = " << block_size); + SCOPED_TRACE(testing::Message() << "with items_per_thread = " << items_per_thread); + SCOPED_TRACE(testing::Message() << "with max_radix_bits = " << MaxRadixBits); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + for(size_t seed_index = 0; seed_index < number_of_runs; ++seed_index) + { + seed_type seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector keys_input + = test_utils::get_random_data_wrapped(size, + common::generate_limits::min(), + common::generate_limits::max(), + seed_value); + + + union converter{ + T in; + uint64_t out; + } c; + // Calculated expected results on host + std::vector expected(size); + std::vector expected_histogram(pc_size, 0); + std::vector expected_prefix(pc_size, 0); + for(size_t i = 0; i < grid_size; ++i) + { + size_t block_offset = i * items_per_block; + + // Perform an 'argsort', which gives a sorted sequence of indices into `keys_input`. + std::vector indices(items_per_block); + std::iota(indices.begin(), indices.end(), 0); + + std::stable_sort( + indices.begin(), + indices.end(), + [&](const int& i, const int& j) + { + c.in = keys_input[block_offset + i]; + uint64_t left = c.out & end_bits; + + c.in = keys_input[block_offset + j]; + + uint64_t right = c.out & end_bits; + + return Descending ? right < left : left < right; + }); + + // Invert the sorted indices sequence to obtain the ranks. + for(size_t j = 0; j < items_per_block; ++j) + { + expected[block_offset + indices[j]] = static_cast(j); + } + + size_t pc_block_offset = i * (pc_items_per_block); + for(size_t j = 0; j < items_per_block; j++){ + c.in = keys_input[block_offset + j]; + uint64_t bit_rep = c.out; + bit_rep &= end_bits; + + if(Descending) + bit_rep = pc_items_per_block - (1 + bit_rep); + + ++expected_histogram[bit_rep + pc_block_offset]; + } + std::exclusive_scan( + expected_histogram.begin() + pc_block_offset, + expected_histogram.begin() + pc_block_offset + pc_items_per_block, + expected_prefix.begin() + pc_block_offset, + 0 + ); + } + common::device_ptr d_keys_input(keys_input); + common::device_ptr d_ranks_output(size); + common::device_ptr d_prefix_output(pc_size); + common::device_ptr d_counts_output(pc_size); + + // Running kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + rank_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + d_keys_input.get(), + d_ranks_output.get(), + d_prefix_output.get(), + d_counts_output.get(), + [] (const T & key){ + const uint64_t end_bits = ((pc_items_per_block) - 1); + union converter{ + T in; + uint64_t out; + } c; + c.in = key; + + uint64_t out = c.out & end_bits; + + if(Descending) + out = pc_items_per_block - (1 + out); + + return out; + } + ); + HIP_CHECK(hipGetLastError()); + + // Getting results to host + auto ranks_output = d_ranks_output.load(); + auto prefix_output = d_prefix_output.load(); + auto counts_output = d_counts_output.load(); + + for(size_t i = 0; i < ranks_output.size(); i++){ + ASSERT_EQ(ranks_output[i], expected[i]) << "Index: " << i << std::endl; + + if(i < pc_size){ + ASSERT_EQ(prefix_output[i], expected_prefix[i]) << "Index: " << i << std::endl; + ASSERT_EQ(counts_output[i], expected_histogram[i]) << "Index: " << i << std::endl; + } + } + } +} + template(); } static_for::run(); } + + static void run_extractor() + { + { + SCOPED_TRACE(testing::Message() << "TestID = " << First); + test_block_radix_extractor_rank(); + } + static_for::run_extractor(); + } + + static void run_extractor_with_prefix_count() + { + { + SCOPED_TRACE(testing::Message() << "TestID = " << First); + test_block_radix_rank_with_prefix_and_count(); + } + static_for::run_extractor_with_prefix_count(); + } }; template { static void run() {} + static void run_extractor() {} + static void run_extractor_with_prefix_count() {} }; template @@ -266,4 +729,32 @@ void test_block_radix_rank_algorithm() static_for<0, n_sizes, type, block_size, Algorithm>::run(); } +template +void test_block_radix_rank_extractor_algorithm() +{ + using type = typename TestFixture::params::input_type; + constexpr size_t block_size = TestFixture::params::block_size; + + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + + static_for<0, n_sizes, type, block_size, Algorithm>::run_extractor(); +} + +template +void test_block_radix_rank_extractor_with_prefix_count_algorithm() +{ + using type = typename TestFixture::params::input_type; + constexpr size_t block_size = TestFixture::params::block_size; + + if(block_size > test_utils::get_max_block_size()) + { + GTEST_SKIP(); + } + + static_for<0, n_sizes, type, block_size, Algorithm>::run_extractor_with_prefix_count(); +} + #endif // TEST_BLOCK_RADIX_RANK_KERNELS_HPP_ diff --git a/test/rocprim/test_block_scan.hpp b/test/rocprim/test_block_scan.hpp index 32ac8c51f..97704a26c 100644 --- a/test/rocprim/test_block_scan.hpp +++ b/test/rocprim/test_block_scan.hpp @@ -664,6 +664,34 @@ typed_test_def(suite_name_array, name_suffix, InclusiveScanPrefixCallback) static_for_input_array<0, 2, T, 2, block_size>::run(); } +typed_test_def(suite_name_array, name_suffix, InclusiveScanWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 6, block_size>::run(); +} +typed_test_def(suite_name_array, name_suffix, InclusiveScanWithStorageAndInit) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 7, block_size>::run(); +} +typed_test_def(suite_name_array, name_suffix, InclusiveScanReduceWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 8, block_size>::run(); +} +typed_test_def(suite_name_array, name_suffix, InclusiveScanReduceWithStorageAndInit) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 9, block_size>::run(); +} typed_test_def(suite_name_array, name_suffix, ExclusiveScan) { using T = typename TestFixture::input_type; @@ -687,3 +715,19 @@ typed_test_def(suite_name_array, name_suffix, ExclusiveScanPrefixCallback) static_for_input_array<0, 2, T, 5, block_size>::run(); } + +typed_test_def(suite_name_array, name_suffix, ExclusiveScanWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 10, block_size>::run(); +} + +typed_test_def(suite_name_array, name_suffix, ExclusiveScanReduceWithStorage) +{ + using T = typename TestFixture::input_type; + constexpr size_t block_size = TestFixture::block_size; + + static_for_input_array<0, 2, T, 11, block_size>::run(); +} \ No newline at end of file diff --git a/test/rocprim/test_block_scan.kernels.hpp b/test/rocprim/test_block_scan.kernels.hpp index 4eca2c418..79cd7395b 100644 --- a/test/rocprim/test_block_scan.kernels.hpp +++ b/test/rocprim/test_block_scan.kernels.hpp @@ -400,6 +400,138 @@ void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_out } } +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_array_with_storage_kernel(T * device_output){ + + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + + T input[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++) + input[i] = device_output[i + index]; + + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + + bscan.inclusive_scan(input, input, storage, BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++) + device_output[i + index] = input[i]; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_array_with_storage_kernel(T * device_output, T init){ + + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + + T input[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++) + input[i] = device_output[i + index]; + + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + + bscan.inclusive_scan(input, init, input, storage, BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++) + device_output[i + index] = input[i]; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions) +{ + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + T reduction; + bscan.inclusive_scan(in_out, in_out, reduction, storage, BinaryOp()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(threadIdx.x == 0) + { + device_output_reductions[blockIdx.x] = reduction; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void inclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions, T init) +{ + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; + + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + T reduction; + bscan.inclusive_scan(in_out, init, in_out, reduction, storage, BinaryOp()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(threadIdx.x == 0) + { + device_output_reductions[blockIdx.x] = reduction; + } +} + template< unsigned int BlockSize, unsigned int ItemsPerThread, @@ -511,6 +643,72 @@ void exclusive_scan_prefix_callback_array_kernel( } } +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void exclusive_scan_array_with_storage_kernel(T * device_output, T init){ + + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + + T input[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++) + input[i] = device_output[i + index]; + + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + + bscan.exclusive_scan(input, input, init, storage, BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++) + device_output[i + index] = input[i]; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + rocprim::block_scan_algorithm Algorithm, + class T, + class BinaryOp +> +__global__ +__launch_bounds__(BlockSize) +void exclusive_scan_reduce_array_with_storage_kernel(T* device_output, T* device_output_reductions, T init) +{ + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + // load + T in_out[ItemsPerThread]; + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + in_out[j] = device_output[index + j]; + } + + using bscan_t = rocprim::block_scan; + __shared__ typename bscan_t::storage_type storage; + rocprim::block_scan bscan; + T reduction; + bscan.exclusive_scan(in_out, in_out, init, reduction, storage, BinaryOp()); + + // store + for(unsigned int j = 0; j < ItemsPerThread; j++) + { + device_output[index + j] = in_out[j]; + } + + if(threadIdx.x == 0) + { + device_output_reductions[blockIdx.x] = reduction; + } +} + // Test for scan template< class T, @@ -772,7 +970,7 @@ template< rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan > auto test_block_scan_input_arrays() --> typename std::enable_if::type +-> typename std::enable_if::type { using binary_op_type = rocprim::maximum; @@ -787,8 +985,8 @@ auto test_block_scan_input_arrays() } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 19; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); SCOPED_TRACE(testing::Message() << "with size = " << size); @@ -801,18 +999,16 @@ auto test_block_scan_input_arrays() // Generate data std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); - const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host std::vector expected(output.size(), T(0)); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { - expected[i * items_per_block] = init; - for(size_t j = 1; j < items_per_block; j++) + for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; - expected[idx] = binary_op(output[idx-1], expected[idx-1]); + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); } } @@ -820,7 +1016,7 @@ auto test_block_scan_input_arrays() common::device_ptr device_output(output); // Launching kernel - hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_array_kernel auto test_block_scan_input_arrays() --> typename std::enable_if::type +-> typename std::enable_if::type { using binary_op_type = rocprim::maximum; @@ -878,63 +1072,50 @@ auto test_block_scan_input_arrays() { unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - + // Generate data std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); - - // Output reduce results - std::vector output_reductions(size / items_per_block); - const T init = test_utils::get_random_value(0, 100, seed_value); - + // Calculate expected results on host std::vector expected(output.size(), T(0)); - std::vector expected_reductions(output_reductions.size(), T(0)); + const T init = test_utils::get_random_value(0, 100, seed_value); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { expected[i * items_per_block] = init; - for(size_t j = 1; j < items_per_block; j++) - { - auto idx = i * items_per_block + j; - expected[idx] = binary_op(output[idx-1], expected[idx-1]); - } for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; - expected_reductions[i] = binary_op(expected_reductions[i], output[idx]); + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); } } // Writing to device memory common::device_ptr device_output(output); - common::device_ptr device_output_reductions(output_reductions.size()); // Launching kernel - hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_array_kernel), - dim3(grid_size), - dim3(block_size), - 0, - 0, - device_output.get(), - device_output_reductions.get(), - init); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + init + ); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - output = device_output.load(); - output_reductions = device_output_reductions.load(); + output = device_output.load(); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); } - } template< @@ -945,7 +1126,7 @@ template< rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan > auto test_block_scan_input_arrays() --> typename std::enable_if::type +-> typename std::enable_if::type { using binary_op_type = rocprim::maximum; @@ -974,53 +1155,403 @@ auto test_block_scan_input_arrays() // Generate data std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); - std::vector output_block_prefixes(size / items_per_block); - T block_prefix = test_utils::get_random_value(0, 100, seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, T(0)); // Calculate expected results on host std::vector expected(output.size(), T(0)); - std::vector expected_block_prefixes(output_block_prefixes.size(), T(0)); + std::vector expected_reductions(output_reductions.size(), T(0)); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { - expected[i * items_per_block] = block_prefix; - for(size_t j = 1; j < items_per_block; j++) - { - auto idx = i * items_per_block + j; - expected[idx] = binary_op(output[idx-1], expected[idx-1]); - } - expected_block_prefixes[i] = block_prefix; for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; - expected_block_prefixes[i] = binary_op(expected_block_prefixes[i], output[idx]); + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); } + expected_reductions[i] = expected[(i+1) * items_per_block - 1]; } // Writing to device memory common::device_ptr device_output(output); - common::device_ptr device_output_bp(output_block_prefixes.size()); + common::device_ptr device_output_reductions(output_reductions); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel), - dim3(grid_size), - dim3(block_size), - 0, - 0, - device_output.get(), - device_output_bp.get(), - block_prefix); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_reductions.get()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - output = device_output.load(); + output = device_output.load(); + output_reductions = device_output_reductions.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + } + +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Output reduce results + std::vector output_reductions(size / block_size, T(0)); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + std::vector expected_reductions(output_reductions.size(), T(0)); + const T init = test_utils::get_random_value(0, 100, seed_value); + + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx], expected[j > 0 ? idx-1 : idx]); + } + expected_reductions[i] = expected[(i+1) * items_per_block - 1]; + } + + // Writing to device memory + common::device_ptr device_output(output); + common::device_ptr device_output_reductions(output_reductions); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_reductions.get(), + init + ); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + output_reductions = device_output_reductions.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + } + +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + const T init = test_utils::get_random_value(0, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx-1], expected[idx-1]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + init); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } + +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Output reduce results + std::vector output_reductions(size / items_per_block); + const T init = test_utils::get_random_value(0, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + std::vector expected_reductions(output_reductions.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx-1], expected[idx-1]); + } + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected_reductions[i] = binary_op(expected_reductions[i], output[idx]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + common::device_ptr device_output_reductions(output_reductions.size()); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_reductions.get(), + init); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + output_reductions = device_output_reductions.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + } + +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + std::vector output_block_prefixes(size / items_per_block); + T block_prefix = test_utils::get_random_value(0, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + std::vector expected_block_prefixes(output_block_prefixes.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = block_prefix; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx-1], expected[idx-1]); + } + expected_block_prefixes[i] = block_prefix; + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected_block_prefixes[i] = binary_op(expected_block_prefixes[i], output[idx]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + common::device_ptr device_output_bp(output_block_prefixes.size()); + + // Launching kernel + hipLaunchKernelGGL( + HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_bp.get(), + block_prefix); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); output_block_prefixes = device_output_bp.load(); // Validating results @@ -1031,6 +1562,179 @@ auto test_block_scan_input_arrays() } +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + const T init = test_utils::get_random_value(0, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx-1], expected[idx-1]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + init); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + } + +} + +template< + class T, + int Method, + unsigned int BlockSize = 256U, + unsigned int ItemsPerThread = 1U, + rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan +> +auto test_block_scan_input_arrays() +-> typename std::enable_if::type +{ + using binary_op_type = rocprim::maximum; + + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + + // Given block size not supported + if(block_size > test_utils::get_max_block_size()) + { + return; + } + + const size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 19; + const size_t grid_size = size / items_per_block; + + SCOPED_TRACE(testing::Message() << "with items_per_block = " << items_per_block); + SCOPED_TRACE(testing::Message() << "with size = " << size); + SCOPED_TRACE(testing::Message() << "with grid_size = " << grid_size); + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + // Generate data + std::vector output = test_utils::get_random_data_wrapped(size, 2, 100, seed_value); + + // Output reduce results + std::vector output_reductions(size / items_per_block); + const T init = test_utils::get_random_value(0, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output.size(), T(0)); + std::vector expected_reductions(output_reductions.size(), T(0)); + binary_op_type binary_op; + for(size_t i = 0; i < output.size() / items_per_block; i++) + { + expected[i * items_per_block] = init; + for(size_t j = 1; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected[idx] = binary_op(output[idx-1], expected[idx-1]); + } + for(size_t j = 0; j < items_per_block; j++) + { + auto idx = i * items_per_block + j; + expected_reductions[i] = binary_op(expected_reductions[i], output[idx]); + } + } + + // Writing to device memory + common::device_ptr device_output(output); + common::device_ptr device_output_reductions(output_reductions.size()); + + // Launching kernel + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_array_with_storage_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output.get(), + device_output_reductions.get(), + init); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Read from device memory + output = device_output.load(); + output_reductions = device_output_reductions.load(); + + // Validating results + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); + ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + } + +} + // Static for-loop template struct static_for_input_array diff --git a/test/rocprim/test_block_sort.hpp b/test/rocprim/test_block_sort.hpp index 89fa0fde6..f57543a83 100644 --- a/test/rocprim/test_block_sort.hpp +++ b/test/rocprim/test_block_sort.hpp @@ -333,6 +333,287 @@ void TestSortStableKey(std::vector sizes) } } } + +template +void TestSortKeyNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + size_t grid_size = 1134; + size_t size = items_per_block * grid_size; + SCOPED_TRACE(testing::Message() << "with size = " << size); + // Generate data + std::vector output + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output); + binary_op_type binary_op; + for(size_t i = 0; i < grid_size; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + std::min(size, ((i + 1) * items_per_block)), + binary_op); + } + + // Preparing device + common::device_ptr device_key_output(output); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_keys_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get() + ); + + // Reading results back + output = device_key_output.load(); + + test_utils::assert_eq(output, expected); + } +} + +template +void TestSortKeyValueNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get()); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + +template +void TestSortKeyValueWithSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_with_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get(), + items_per_block); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + #endif // TEST_ROCPRIM_TEST_BLOCK_SORT_HPP_ // This file is included multiple times in the test_block_sort_[algo].cpp file, because // the test definitions below this header guard need to be compiled for each test suites: @@ -429,7 +710,7 @@ typed_test_def(suite_name, static constexpr const unsigned int block_size = TestFixture::block_size; static constexpr const unsigned int items_per_thread = 4; std::vector sizes - = {0, 53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; + = {0,53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; TestSortKey(sizes); } @@ -443,3 +724,69 @@ typed_test_def(suite_name, name_suffix, SortKeyValueDesc) static constexpr const unsigned int items_per_thread = 1; TestSortKeyValue(); } + +typed_test_def(suite_name, name_suffix, SortKeyNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueWithSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueWithSize(); +} diff --git a/test/rocprim/test_block_sort.kernels.hpp b/test/rocprim/test_block_sort.kernels.hpp index d595471d3..6aab8eaee 100644 --- a/test/rocprim/test_block_sort.kernels.hpp +++ b/test/rocprim/test_block_sort.kernels.hpp @@ -237,4 +237,256 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(key_type* /*keys* OffsetT /*size*/) {} +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_key = keys[index]; + + bsort_type().sort(thread_key, + storage, + BinaryOp()); + + keys[index] = thread_key; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_keys[ItemsPerThread]; + for(size_t i = 0; i < ItemsPerThread; i++) + thread_keys[i] = keys[index + i]; + // + bsort_type().sort(thread_keys, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++) + keys[index + i] = thread_keys[i]; +} + +template::value_type>, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator) +{} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + key_type thread_key = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_key, + thread_value, + storage, + BinaryOp() + ); + keys[index] = thread_key; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + bsort_type().sort(thread_keys, + thread_value, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{} + + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + keys[index] = thread_keys; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, const unsigned int size) +{} #endif // TEST_BLOCK_SORT_KERNELS_HPP_ diff --git a/test/rocprim/test_utils_types.hpp b/test/rocprim/test_utils_types.hpp index 25b97fb9f..ad1f57e27 100644 --- a/test/rocprim/test_utils_types.hpp +++ b/test/rocprim/test_utils_types.hpp @@ -271,8 +271,18 @@ static constexpr unsigned int items[n_items] = { static constexpr unsigned int block_size = Params::block_size; \ }; +#define block_store_test_suite_type_def_helper(name, suffix) \ + template \ + class name ## suffix : public ::testing::Test { \ + public: \ + using DataType = typename Params::input_type; \ + static constexpr unsigned int block_size = Params::block_size; \ + }; + #define block_sort_test_suite_type_def(name, suffix) block_sort_test_suite_type_def_helper(name, suffix) +#define block_store_test_suite_type_def(name, suffix) block_store_test_suite_type_def_helper(name, suffix) + #define typed_test_suite_def_helper(name, suffix, params) TYPED_TEST_SUITE(name ## suffix, params) #define typed_test_suite_def(name, suffix, params) typed_test_suite_def_helper(name, suffix, params)