diff --git a/CHANGELOG.md b/CHANGELOG.md index ea472afa0..b39c9af6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added additional unit tests for `test_block_scan.hpp` +* Added additional unit tests for `test_block_sort.hpp` ### Changed diff --git a/test/rocprim/test_block_sort.hpp b/test/rocprim/test_block_sort.hpp index 89fa0fde6..f57543a83 100644 --- a/test/rocprim/test_block_sort.hpp +++ b/test/rocprim/test_block_sort.hpp @@ -333,6 +333,287 @@ void TestSortStableKey(std::vector sizes) } } } + +template +void TestSortKeyNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + size_t grid_size = 1134; + size_t size = items_per_block * grid_size; + SCOPED_TRACE(testing::Message() << "with size = " << size); + // Generate data + std::vector output + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Calculate expected results on host + std::vector expected(output); + binary_op_type binary_op; + for(size_t i = 0; i < grid_size; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + std::min(size, ((i + 1) * items_per_block)), + binary_op); + } + + // Preparing device + common::device_ptr device_key_output(output); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_keys_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get() + ); + + // Reading results back + output = device_key_output.load(); + + test_utils::assert_eq(output, expected); + } +} + +template +void TestSortKeyValueNoSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_no_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get()); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + +template +void TestSortKeyValueWithSize() +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + static constexpr const unsigned int items_per_block = block_size * items_per_thread; + static constexpr const size_t grid_size = 1134; + static constexpr const size_t size = items_per_block * grid_size; + hipStream_t stream = 0; // default + + if(!is_buildable(block_size, items_per_thread, algo)) + { + GTEST_SKIP(); + } + + for(size_t seed_index = 0; seed_index < number_of_runs; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector output_key + = test_utils::get_random_data_wrapped(size, 0, 100, seed_value); + std::vector output_value + = test_utils::get_random_data_wrapped(size, -100, 100, seed_value); + + // Combine vectors to form pairs with key and value + std::vector> target(size); + for(unsigned i = 0; i < target.size(); i++) + target[i] = std::make_pair(output_key[i], output_value[i]); + + // Calculate expected results on host + using key_value = std::pair; + std::vector expected(target); + constexpr bool descending = !std::is_same>::value; + for(size_t i = 0; i < expected.size() / items_per_block; i++) + { + std::sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + test_utils::key_value_comparator()); + } + + // Preparing device + common::device_ptr device_key_output(output_key); + common::device_ptr device_value_output(output_value); + + // Running kernel, ignored if invalid size + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel_with_size), + dim3(grid_size), + dim3(block_size), + 0, + stream, + device_key_output.get(), + device_value_output.get(), + items_per_block); + HIP_CHECK(hipGetLastError()); + + // Reading results back + output_key = device_key_output.load(); + output_value = device_value_output.load(); + + std::vector expected_key(expected.size()); + std::vector expected_value(expected.size()); + for(size_t i = 0; i < expected.size(); i++) + { + expected_key[i] = expected[i].first; + expected_value[i] = expected[i].second; + } + + // Keys are sorted, Values order not guaranteed + // Sort subsets where key was the same to make sure all values are still present + using value_op_type = rocprim::less; + using eq_op_type = rocprim::equal_to; + value_op_type value_op; + eq_op_type eq_op; + for(size_t i = 0; i < output_key.size();) + { + auto j = i; + for(; j < output_key.size() && eq_op(output_key[j], output_key[i]); ++j) + {} + std::sort(output_value.begin() + i, output_value.begin() + j, value_op); + std::sort(expected_value.begin() + i, expected_value.begin() + j, value_op); + i = j; + } + + test_utils::assert_eq(output_key, expected_key); + test_utils::assert_eq(output_value, expected_value); + } +} + #endif // TEST_ROCPRIM_TEST_BLOCK_SORT_HPP_ // This file is included multiple times in the test_block_sort_[algo].cpp file, because // the test definitions below this header guard need to be compiled for each test suites: @@ -429,7 +710,7 @@ typed_test_def(suite_name, static constexpr const unsigned int block_size = TestFixture::block_size; static constexpr const unsigned int items_per_thread = 4; std::vector sizes - = {0, 53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; + = {0,53, 512, 5000, 34567, (1 << 17) - 1220, 1134 * 256, (1 << 20) - 123}; TestSortKey(sizes); } @@ -443,3 +724,69 @@ typed_test_def(suite_name, name_suffix, SortKeyValueDesc) static constexpr const unsigned int items_per_thread = 1; TestSortKeyValue(); } + +typed_test_def(suite_name, name_suffix, SortKeyNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::greater; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueNoSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueNoSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSize) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 1; + TestSortKeyValueWithSize(); +} + +typed_test_def(suite_name, name_suffix, SortKeyValueWithSizeMultipleItemsPerThread) +{ + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using binary_op_type = typename rocprim::less; + static constexpr const rocprim::block_sort_algorithm algo = TEST_BLOCK_SORT_ALGORITHM; + static constexpr const unsigned int block_size = TestFixture::block_size; + static constexpr const unsigned int items_per_thread = 4; + TestSortKeyValueWithSize(); +} diff --git a/test/rocprim/test_block_sort.kernels.hpp b/test/rocprim/test_block_sort.kernels.hpp index d595471d3..6aab8eaee 100644 --- a/test/rocprim/test_block_sort.kernels.hpp +++ b/test/rocprim/test_block_sort.kernels.hpp @@ -237,4 +237,256 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(key_type* /*keys* OffsetT /*size*/) {} +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_key = keys[index]; + + bsort_type().sort(thread_key, + storage, + BinaryOp()); + + keys[index] = thread_key; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class KeyIterator, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less::value_type>, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator keys) +{ + using key_type = typename std::iterator_traits::value_type; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + threadIdx.x * ItemsPerThread; + + using bsort_type + = rocprim::block_sort; + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + key_type thread_keys[ItemsPerThread]; + for(size_t i = 0; i < ItemsPerThread; i++) + thread_keys[i] = keys[index + i]; + // + bsort_type().sort(thread_keys, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++) + keys[index + i] = thread_keys[i]; +} + +template::value_type>, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_keys_kernel_no_size(KeyIterator) +{} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + key_type thread_key = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_key, + thread_value, + storage, + BinaryOp() + ); + keys[index] = thread_key; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + bsort_type().sort(thread_keys, + thread_value, + storage, + BinaryOp()); + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_no_size(key_type* keys, + value_type* values) +{} + + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread == 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys = keys[index]; + value_type thread_value = values[index]; + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + keys[index] = thread_keys; + values[index] = thread_value; +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t<(ItemsPerThread > 1u && is_buildable(BlockSize, ItemsPerThread, algorithm)), + int> + = 0> + +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, + const unsigned int size + ) +{ + using bsort_type + = rocprim::block_sort; + + ROCPRIM_SHARED_MEMORY typename bsort_type::storage_type storage; + + static constexpr const unsigned int ItemsPerBlock = ItemsPerThread * BlockSize; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; + const unsigned int index = block_offset + (threadIdx.x * ItemsPerThread); + + key_type thread_keys[ItemsPerThread]; + value_type thread_value[ItemsPerThread]; + + for(size_t i = 0; i < ItemsPerThread; i++){ + thread_keys[i] = keys[index + i]; + thread_value[i] = values[index + i]; + } + + bsort_type().sort(thread_keys, + thread_value, + storage, + size, + BinaryOp()); + + for(size_t i = 0; i < ItemsPerThread; i++){ + keys[index + i] = thread_keys[i]; + values[index + i] = thread_value[i]; + } +} + +template< + unsigned int BlockSize, + unsigned int ItemsPerThread, + class key_type, + class value_type, + rocprim::block_sort_algorithm algorithm, + class BinaryOp = rocprim::less, + std::enable_if_t = 0> +__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel_with_size(key_type* keys, + value_type* values, const unsigned int size) +{} #endif // TEST_BLOCK_SORT_KERNELS_HPP_