diff --git a/.githooks/pre-commit b/.githooks/pre-commit index 7ca327ad5..aea1a695e 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -4,7 +4,7 @@ # are installed, and if so, uses the installed version to format # the staged changes. -base=clang-format-3.8 +base=/opt/rocm/hcc/bin/clang-format format="" # Redirect output to stderr. @@ -16,8 +16,8 @@ type "$base" >/dev/null 2>&1 && format="$base" # no versions of clang-format are installed if [ -z "$format" ] then - echo "$base is not installed. Commit is cancelled. Delete the hook to force commit." - exit 1 + echo "$base is not installed. Pre-commit hook will not be executed." + exit 0 fi # Do everything from top - level @@ -31,28 +31,13 @@ else against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 fi -exitCode=0 - # do the formatting for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') do if [ -e "$file" ] then - # echo "$format $file" - "$format" -style=file -output-replacements-xml "$file" | grep "\" >/dev/null - if [ $? -ne 1 ]; then - echo "$format" -i -style=file "$file" - exitCode=1 - fi - + echo "$format $file" + "$format" -i -style=file "$file" fi done -if [ $exitCode -eq 1 ]; then - echo "Please fix the errors by running the commands listed above." - echo "Commit is cancelled. You may force by deleting the hook in .githook" -else - echo "Clang format checks passed" -fi - -exit $exitCode diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 32b1fe451..50a9fd6f3 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -55,43 +56,37 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; namespace rp = rocprim; -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials -> -__global__ -void kernel(const T * d_input, T * d_output) +template +__global__ void kernel(const T* d_input, T* d_output) { Runner::template run(d_input, d_output); } struct flag_heads { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; + bool head_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_heads(head_flags, T(123), input, rp::equal_to()); @@ -114,27 +109,24 @@ struct flag_heads struct flag_tails { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; - bool tail_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_tails(tail_flags, T(123), input, rp::equal_to()); @@ -157,35 +149,34 @@ struct flag_tails struct flag_heads_and_tails { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; - bool head_flags[ItemsPerThread]; - bool tail_flags[ItemsPerThread]; + bool head_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; if(WithTile) { - bdiscontinuity.flag_heads_and_tails(head_flags, T(123), tail_flags, T(234), input, rp::equal_to()); + bdiscontinuity.flag_heads_and_tails( + head_flags, T(123), tail_flags, T(234), input, rp::equal_to()); } else { - bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, rp::equal_to()); + bdiscontinuity.flag_heads_and_tails( + head_flags, tail_flags, input, rp::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; i++) @@ -200,31 +191,23 @@ struct flag_heads_and_tails } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool WithTile, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = get_random_data(size, T(0), T(10)); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) @@ -233,15 +216,18 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -251,55 +237,41 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ -benchmark::RegisterBenchmark( \ - (std::string("block_discontinuity<" #T ", " #BS ">.") + name + ("<" #IPT ", " #WITH_TILE ">")).c_str(), \ - run_benchmark, \ - stream, size \ -) +#define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ + benchmark::RegisterBenchmark((std::string("block_discontinuity<" #T ", " #BS ">.") + name \ + + ("<" #IPT ", " #WITH_TILE ">")) \ + .c_str(), \ + run_benchmark, \ + stream, \ + size) -template -void add_benchmarks(const std::string& name, +template +void add_benchmarks(const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { - CREATE_BENCHMARK(int, 256, 1, false), - CREATE_BENCHMARK(int, 256, 2, false), - CREATE_BENCHMARK(int, 256, 3, false), - CREATE_BENCHMARK(int, 256, 4, false), - CREATE_BENCHMARK(int, 256, 6, false), - CREATE_BENCHMARK(int, 256, 7, false), - CREATE_BENCHMARK(int, 256, 8, false), - CREATE_BENCHMARK(int, 256, 1, true), - CREATE_BENCHMARK(int, 256, 2, true), - CREATE_BENCHMARK(int, 256, 3, true), - CREATE_BENCHMARK(int, 256, 4, true), - CREATE_BENCHMARK(int, 256, 6, true), - CREATE_BENCHMARK(int, 256, 7, true), - CREATE_BENCHMARK(int, 256, 8, true), - CREATE_BENCHMARK(long long, 256, 1, false), - CREATE_BENCHMARK(long long, 256, 2, false), - CREATE_BENCHMARK(long long, 256, 3, false), - CREATE_BENCHMARK(long long, 256, 4, false), - CREATE_BENCHMARK(long long, 256, 6, false), - CREATE_BENCHMARK(long long, 256, 7, false), - CREATE_BENCHMARK(long long, 256, 8, false), - CREATE_BENCHMARK(long long, 256, 1, true), - CREATE_BENCHMARK(long long, 256, 2, true), - CREATE_BENCHMARK(long long, 256, 3, true), - CREATE_BENCHMARK(long long, 256, 4, true), - CREATE_BENCHMARK(long long, 256, 6, true), - CREATE_BENCHMARK(long long, 256, 7, true), - CREATE_BENCHMARK(long long, 256, 8, true), + std::vector bs = { + CREATE_BENCHMARK(int, 256, 1, false), CREATE_BENCHMARK(int, 256, 2, false), + CREATE_BENCHMARK(int, 256, 3, false), CREATE_BENCHMARK(int, 256, 4, false), + CREATE_BENCHMARK(int, 256, 6, false), CREATE_BENCHMARK(int, 256, 7, false), + CREATE_BENCHMARK(int, 256, 8, false), CREATE_BENCHMARK(int, 256, 1, true), + CREATE_BENCHMARK(int, 256, 2, true), CREATE_BENCHMARK(int, 256, 3, true), + CREATE_BENCHMARK(int, 256, 4, true), CREATE_BENCHMARK(int, 256, 6, true), + CREATE_BENCHMARK(int, 256, 7, true), CREATE_BENCHMARK(int, 256, 8, true), + CREATE_BENCHMARK(long long, 256, 1, false), CREATE_BENCHMARK(long long, 256, 2, false), + CREATE_BENCHMARK(long long, 256, 3, false), CREATE_BENCHMARK(long long, 256, 4, false), + CREATE_BENCHMARK(long long, 256, 6, false), CREATE_BENCHMARK(long long, 256, 7, false), + CREATE_BENCHMARK(long long, 256, 8, false), CREATE_BENCHMARK(long long, 256, 1, true), + CREATE_BENCHMARK(long long, 256, 2, true), CREATE_BENCHMARK(long long, 256, 3, true), + CREATE_BENCHMARK(long long, 256, 4, true), CREATE_BENCHMARK(long long, 256, 6, true), + CREATE_BENCHMARK(long long, 256, 7, true), CREATE_BENCHMARK(long long, 256, 8, true), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -308,13 +280,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index 1c2059b47..42fe87428 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -55,37 +56,28 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; namespace rp = rocprim; -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -void kernel(const T * d_input, T * d_output) +template +__global__ void kernel(const T* d_input, T* d_output) { Runner::template run(d_input, d_output); } struct blocked_to_striped { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -98,22 +90,16 @@ struct blocked_to_striped struct striped_to_blocked { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -126,22 +112,16 @@ struct striped_to_blocked struct blocked_to_warp_striped { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -154,22 +134,16 @@ struct blocked_to_warp_striped struct warp_striped_to_blocked { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -182,24 +156,18 @@ struct warp_striped_to_blocked struct scatter_to_blocked { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; + T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_input + block_offset, ranks); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -212,24 +180,18 @@ struct scatter_to_blocked struct scatter_to_striped { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T * d_input, T * d_output) + template + __device__ static void run(const T* d_input, T* d_output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T input[ItemsPerThread]; + T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_input + block_offset, ranks); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -240,17 +202,15 @@ struct scatter_to_striped } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input(size); // Fill input with ranks (for scatter operations) @@ -261,34 +221,30 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::iota(block_ranks, block_ranks + items_per_block, 0); std::shuffle(block_ranks, block_ranks + items_per_block, gen); } - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -298,21 +254,20 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string("block_exchange<" #T ", " #BS ", " #IPT ">.") + name).c_str(), \ - run_benchmark, \ - stream, size \ -) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_exchange<" #T ", " #BS ", " #IPT ">.") + name).c_str(), \ + run_benchmark, \ + stream, \ + size) -template -void add_benchmarks(const std::string& name, +template +void add_benchmarks(const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { CREATE_BENCHMARK(int, 256, 1), CREATE_BENCHMARK(int, 256, 2), CREATE_BENCHMARK(int, 256, 3), @@ -332,7 +287,7 @@ void add_benchmarks(const std::string& name, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -341,13 +296,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index 9224072d9..16d161b7d 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -55,35 +56,29 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; namespace rp = rocprim; -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize, - unsigned int Trials -> -__global__ -void kernel(const T* input, T* output) +template +__global__ void kernel(const T* input, T* output) { Runner::template run(input, output); } -template +template struct histogram { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) + template + __device__ static void run(const T* input, T* output) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; + unsigned int global_offset = hipBlockIdx_x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) @@ -91,18 +86,18 @@ struct histogram values[k] = input[index + k]; } - using bhistogram_t = rp::block_histogram; - __shared__ T histogram[BinSize]; + using bhistogram_t = rp::block_histogram; + __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t().histogram(values, histogram, storage); } - #pragma unroll - for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) +#pragma unroll + for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + hipThreadIdx_x < BinSize) { @@ -113,49 +108,44 @@ struct histogram } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize = BlockSize, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); - const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); + const auto bin_size = BinSize * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, 0.0f); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -167,42 +157,41 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_histogram<"#T", "#BS", "#IPT", " + algorithm_name + ">.") + method_name).c_str(), \ - run_benchmark, \ - stream, size \ - ) - -template +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_histogram<" #T ", " #BS ", " #IPT ", " + algorithm_name + ">.") \ + + method_name) \ + .c_str(), \ + run_benchmark, \ + stream, \ + size) + +template void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) { - std::vector new_benchmarks = - { - CREATE_BENCHMARK(int, 256, 1), - CREATE_BENCHMARK(int, 256, 2), - CREATE_BENCHMARK(int, 256, 3), - CREATE_BENCHMARK(int, 256, 4), - CREATE_BENCHMARK(int, 256, 8), - CREATE_BENCHMARK(int, 256, 11), - CREATE_BENCHMARK(int, 256, 16), - - CREATE_BENCHMARK(int, 320, 1), - CREATE_BENCHMARK(int, 320, 2), - CREATE_BENCHMARK(int, 320, 3), - CREATE_BENCHMARK(int, 320, 4), - CREATE_BENCHMARK(int, 320, 8), - CREATE_BENCHMARK(int, 320, 11), - CREATE_BENCHMARK(int, 320, 16) - }; + std::vector new_benchmarks = {CREATE_BENCHMARK(int, 256, 1), + CREATE_BENCHMARK(int, 256, 2), + CREATE_BENCHMARK(int, 256, 3), + CREATE_BENCHMARK(int, 256, 4), + CREATE_BENCHMARK(int, 256, 8), + CREATE_BENCHMARK(int, 256, 11), + CREATE_BENCHMARK(int, 256, 16), + + CREATE_BENCHMARK(int, 320, 1), + CREATE_BENCHMARK(int, 320, 2), + CREATE_BENCHMARK(int, 320, 3), + CREATE_BENCHMARK(int, 320, 4), + CREATE_BENCHMARK(int, 320, 8), + CREATE_BENCHMARK(int, 320, 11), + CREATE_BENCHMARK(int, 320, 16)}; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -211,13 +200,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; @@ -226,14 +215,10 @@ int main(int argc, char *argv[]) std::vector benchmarks; // using_atomic using histogram_a_t = histogram; - add_benchmarks( - benchmarks, "histogram", "using_atomic", stream, size - ); + add_benchmarks(benchmarks, "histogram", "using_atomic", stream, size); // using_sort using histogram_s_t = histogram; - add_benchmarks( - benchmarks, "histogram", "using_sort", stream, size - ); + add_benchmarks(benchmarks, "histogram", "using_sort", stream, size); // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index 82a807019..015f2bd73 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -61,22 +62,16 @@ enum class benchmark_kinds namespace rp = rocprim; -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -void sort_keys_kernel(const T * input, T * output) +template +__global__ void sort_keys_kernel(const T* input, T* output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; rp::block_load_direct_striped(lid, input + block_offset, keys); - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; @@ -86,16 +81,10 @@ void sort_keys_kernel(const T * input, T * output) rp::block_store_direct_striped(lid, output + block_offset, keys); } -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -void sort_pairs_kernel(const T * input, T * output) +template +__global__ void sort_pairs_kernel(const T* input, T* output) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; @@ -106,7 +95,7 @@ void sort_pairs_kernel(const T * input, T * output) values[i] = keys[i] + 1; } - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; @@ -120,41 +109,30 @@ void sort_pairs_kernel(const T * input, T * output) rp::block_store_direct_striped(lid, output + block_offset, keys); } -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 10 -> -void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) +template +void run_benchmark(benchmark::State& state, + benchmark_kinds benchmark_kind, + hipStream_t stream, + size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input; if(std::is_floating_point::value) { - input = get_random_data(size, (T)-1000, (T)+1000); + input = get_random_data(size, (T)-1000, (T) + 1000); } else { input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) @@ -165,24 +143,30 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_keys_kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs_kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -192,105 +176,75 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string("block_radix_sort<" #T ", " #BS ", " #IPT ">.") + name).c_str(), \ - run_benchmark, \ - benchmark_kind, stream, size \ -) +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_radix_sort<" #T ", " #BS ", " #IPT ">.") + name).c_str(), \ + run_benchmark, \ + benchmark_kind, \ + stream, \ + size) -void add_benchmarks(benchmark_kinds benchmark_kind, - const std::string& name, +void add_benchmarks(benchmark_kinds benchmark_kind, + const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { - CREATE_BENCHMARK(int, 64, 1), - CREATE_BENCHMARK(int, 64, 2), - CREATE_BENCHMARK(int, 64, 4), - CREATE_BENCHMARK(int, 64, 7), - CREATE_BENCHMARK(int, 64, 8), - CREATE_BENCHMARK(int, 64, 11), - CREATE_BENCHMARK(int, 64, 15), - CREATE_BENCHMARK(int, 64, 19), + std::vector bs = { + CREATE_BENCHMARK(int, 64, 1), CREATE_BENCHMARK(int, 64, 2), + CREATE_BENCHMARK(int, 64, 4), CREATE_BENCHMARK(int, 64, 7), + CREATE_BENCHMARK(int, 64, 8), CREATE_BENCHMARK(int, 64, 11), + CREATE_BENCHMARK(int, 64, 15), CREATE_BENCHMARK(int, 64, 19), CREATE_BENCHMARK(int, 64, 25), - CREATE_BENCHMARK(int, 128, 1), - CREATE_BENCHMARK(int, 128, 2), - CREATE_BENCHMARK(int, 128, 4), - CREATE_BENCHMARK(int, 128, 7), - CREATE_BENCHMARK(int, 128, 8), - CREATE_BENCHMARK(int, 128, 11), - CREATE_BENCHMARK(int, 128, 15), - CREATE_BENCHMARK(int, 128, 19), + CREATE_BENCHMARK(int, 128, 1), CREATE_BENCHMARK(int, 128, 2), + CREATE_BENCHMARK(int, 128, 4), CREATE_BENCHMARK(int, 128, 7), + CREATE_BENCHMARK(int, 128, 8), CREATE_BENCHMARK(int, 128, 11), + CREATE_BENCHMARK(int, 128, 15), CREATE_BENCHMARK(int, 128, 19), CREATE_BENCHMARK(int, 128, 25), - CREATE_BENCHMARK(int, 256, 1), - CREATE_BENCHMARK(int, 256, 2), - CREATE_BENCHMARK(int, 256, 4), - CREATE_BENCHMARK(int, 256, 7), - CREATE_BENCHMARK(int, 256, 8), - CREATE_BENCHMARK(int, 256, 11), - CREATE_BENCHMARK(int, 256, 15), - CREATE_BENCHMARK(int, 256, 19), + CREATE_BENCHMARK(int, 256, 1), CREATE_BENCHMARK(int, 256, 2), + CREATE_BENCHMARK(int, 256, 4), CREATE_BENCHMARK(int, 256, 7), + CREATE_BENCHMARK(int, 256, 8), CREATE_BENCHMARK(int, 256, 11), + CREATE_BENCHMARK(int, 256, 15), CREATE_BENCHMARK(int, 256, 19), CREATE_BENCHMARK(int, 256, 25), - CREATE_BENCHMARK(int, 512, 1), - CREATE_BENCHMARK(int, 512, 2), - CREATE_BENCHMARK(int, 512, 4), - CREATE_BENCHMARK(int, 512, 7), + CREATE_BENCHMARK(int, 512, 1), CREATE_BENCHMARK(int, 512, 2), + CREATE_BENCHMARK(int, 512, 4), CREATE_BENCHMARK(int, 512, 7), CREATE_BENCHMARK(int, 512, 8), - CREATE_BENCHMARK(int, 1024, 1), - CREATE_BENCHMARK(int, 1024, 2), + CREATE_BENCHMARK(int, 1024, 1), CREATE_BENCHMARK(int, 1024, 2), CREATE_BENCHMARK(int, 1024, 4), - CREATE_BENCHMARK(long long, 64, 1), - CREATE_BENCHMARK(long long, 64, 2), - CREATE_BENCHMARK(long long, 64, 4), - CREATE_BENCHMARK(long long, 64, 7), - CREATE_BENCHMARK(long long, 64, 8), - CREATE_BENCHMARK(long long, 64, 11), - CREATE_BENCHMARK(long long, 64, 15), - CREATE_BENCHMARK(long long, 64, 19), + CREATE_BENCHMARK(long long, 64, 1), CREATE_BENCHMARK(long long, 64, 2), + CREATE_BENCHMARK(long long, 64, 4), CREATE_BENCHMARK(long long, 64, 7), + CREATE_BENCHMARK(long long, 64, 8), CREATE_BENCHMARK(long long, 64, 11), + CREATE_BENCHMARK(long long, 64, 15), CREATE_BENCHMARK(long long, 64, 19), CREATE_BENCHMARK(long long, 64, 25), - CREATE_BENCHMARK(long long, 128, 1), - CREATE_BENCHMARK(long long, 128, 2), - CREATE_BENCHMARK(long long, 128, 4), - CREATE_BENCHMARK(long long, 128, 7), - CREATE_BENCHMARK(long long, 128, 8), - CREATE_BENCHMARK(long long, 128, 11), - CREATE_BENCHMARK(long long, 128, 15), - CREATE_BENCHMARK(long long, 128, 19), + CREATE_BENCHMARK(long long, 128, 1), CREATE_BENCHMARK(long long, 128, 2), + CREATE_BENCHMARK(long long, 128, 4), CREATE_BENCHMARK(long long, 128, 7), + CREATE_BENCHMARK(long long, 128, 8), CREATE_BENCHMARK(long long, 128, 11), + CREATE_BENCHMARK(long long, 128, 15), CREATE_BENCHMARK(long long, 128, 19), CREATE_BENCHMARK(long long, 128, 25), - CREATE_BENCHMARK(long long, 256, 1), - CREATE_BENCHMARK(long long, 256, 2), - CREATE_BENCHMARK(long long, 256, 4), - CREATE_BENCHMARK(long long, 256, 7), - CREATE_BENCHMARK(long long, 256, 8), - CREATE_BENCHMARK(long long, 256, 11), - CREATE_BENCHMARK(long long, 256, 15), - CREATE_BENCHMARK(long long, 256, 19), - - CREATE_BENCHMARK(long long, 512, 1), - CREATE_BENCHMARK(long long, 512, 2), - CREATE_BENCHMARK(long long, 512, 4), - CREATE_BENCHMARK(long long, 512, 7), + CREATE_BENCHMARK(long long, 256, 1), CREATE_BENCHMARK(long long, 256, 2), + CREATE_BENCHMARK(long long, 256, 4), CREATE_BENCHMARK(long long, 256, 7), + CREATE_BENCHMARK(long long, 256, 8), CREATE_BENCHMARK(long long, 256, 11), + CREATE_BENCHMARK(long long, 256, 15), CREATE_BENCHMARK(long long, 256, 19), + + CREATE_BENCHMARK(long long, 512, 1), CREATE_BENCHMARK(long long, 512, 2), + CREATE_BENCHMARK(long long, 512, 4), CREATE_BENCHMARK(long long, 512, 7), CREATE_BENCHMARK(long long, 512, 8), - CREATE_BENCHMARK(long long, 1024, 1), - CREATE_BENCHMARK(long long, 1024, 2), + CREATE_BENCHMARK(long long, 1024, 1), CREATE_BENCHMARK(long long, 1024, 2), CREATE_BENCHMARK(long long, 1024, 4), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -299,13 +253,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index 7305541c2..6d186baf9 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -55,30 +56,21 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; namespace rp = rocprim; -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -void kernel(const T* input, T* output) +template +__global__ void kernel(const T* input, T* output) { Runner::template run(input, output); } -template +template struct reduce { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) + template + __device__ static void run(const T* input, T* output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -92,7 +84,7 @@ struct reduce using breduce_t = rp::block_reduce; __shared__ typename breduce_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { breduce_t().reduce(values, reduced_value, storage); @@ -106,47 +98,41 @@ struct reduce } }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, 1.0f); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -158,69 +144,69 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_reduce<"#T", "#BS", "#IPT", " + algorithm_name + ">.") + method_name).c_str(), \ - run_benchmark, \ - stream, size \ - ) - -template +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_reduce<" #T ", " #BS ", " #IPT ", " + algorithm_name + ">.") \ + + method_name) \ + .c_str(), \ + run_benchmark, \ + stream, \ + size) + +template void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) { - using custom_double2 = custom_type; + using custom_double2 = custom_type; using custom_int_double = custom_type; - std::vector new_benchmarks = - { - CREATE_BENCHMARK(float, 256, 1), - CREATE_BENCHMARK(float, 256, 2), - CREATE_BENCHMARK(float, 256, 3), - CREATE_BENCHMARK(float, 256, 4), - CREATE_BENCHMARK(float, 256, 8), - CREATE_BENCHMARK(float, 256, 11), - CREATE_BENCHMARK(float, 256, 16), - - CREATE_BENCHMARK(int, 256, 1), - CREATE_BENCHMARK(int, 256, 2), - CREATE_BENCHMARK(int, 256, 3), - CREATE_BENCHMARK(int, 256, 4), - CREATE_BENCHMARK(int, 256, 8), - CREATE_BENCHMARK(int, 256, 11), - CREATE_BENCHMARK(int, 256, 16), - - CREATE_BENCHMARK(int, 320, 1), - CREATE_BENCHMARK(int, 320, 2), - CREATE_BENCHMARK(int, 320, 3), - CREATE_BENCHMARK(int, 320, 4), - CREATE_BENCHMARK(int, 320, 8), - CREATE_BENCHMARK(int, 320, 11), - CREATE_BENCHMARK(int, 320, 16), - - CREATE_BENCHMARK(double, 256, 1), - CREATE_BENCHMARK(double, 256, 2), - CREATE_BENCHMARK(double, 256, 3), - CREATE_BENCHMARK(double, 256, 4), - CREATE_BENCHMARK(double, 256, 8), - CREATE_BENCHMARK(double, 256, 11), - CREATE_BENCHMARK(double, 256, 16), - - CREATE_BENCHMARK(custom_double2, 256, 1), - CREATE_BENCHMARK(custom_double2, 256, 4), - CREATE_BENCHMARK(custom_double2, 256, 8), - - CREATE_BENCHMARK(custom_int_double, 256, 1), - CREATE_BENCHMARK(custom_int_double, 256, 4), - CREATE_BENCHMARK(custom_int_double, 256, 8) - }; + std::vector new_benchmarks + = {CREATE_BENCHMARK(float, 256, 1), + CREATE_BENCHMARK(float, 256, 2), + CREATE_BENCHMARK(float, 256, 3), + CREATE_BENCHMARK(float, 256, 4), + CREATE_BENCHMARK(float, 256, 8), + CREATE_BENCHMARK(float, 256, 11), + CREATE_BENCHMARK(float, 256, 16), + + CREATE_BENCHMARK(int, 256, 1), + CREATE_BENCHMARK(int, 256, 2), + CREATE_BENCHMARK(int, 256, 3), + CREATE_BENCHMARK(int, 256, 4), + CREATE_BENCHMARK(int, 256, 8), + CREATE_BENCHMARK(int, 256, 11), + CREATE_BENCHMARK(int, 256, 16), + + CREATE_BENCHMARK(int, 320, 1), + CREATE_BENCHMARK(int, 320, 2), + CREATE_BENCHMARK(int, 320, 3), + CREATE_BENCHMARK(int, 320, 4), + CREATE_BENCHMARK(int, 320, 8), + CREATE_BENCHMARK(int, 320, 11), + CREATE_BENCHMARK(int, 320, 16), + + CREATE_BENCHMARK(double, 256, 1), + CREATE_BENCHMARK(double, 256, 2), + CREATE_BENCHMARK(double, 256, 3), + CREATE_BENCHMARK(double, 256, 4), + CREATE_BENCHMARK(double, 256, 8), + CREATE_BENCHMARK(double, 256, 11), + CREATE_BENCHMARK(double, 256, 16), + + CREATE_BENCHMARK(custom_double2, 256, 1), + CREATE_BENCHMARK(custom_double2, 256, 4), + CREATE_BENCHMARK(custom_double2, 256, 8), + + CREATE_BENCHMARK(custom_int_double, 256, 1), + CREATE_BENCHMARK(custom_int_double, 256, 4), + CREATE_BENCHMARK(custom_int_double, 256, 8)}; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -229,13 +215,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; @@ -244,14 +230,10 @@ int main(int argc, char *argv[]) std::vector benchmarks; // using_warp_scan using reduce_uwr_t = reduce; - add_benchmarks( - benchmarks, "reduce", "using_warp_reduce", stream, size - ); + add_benchmarks(benchmarks, "reduce", "using_warp_reduce", stream, size); // reduce then scan using reduce_rr_t = reduce; - add_benchmarks( - benchmarks, "reduce", "raking_reduce", stream, size - ); + add_benchmarks(benchmarks, "reduce", "raking_reduce", stream, size); // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index 9cdadc115..cbc60081b 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -55,30 +56,21 @@ const size_t DEFAULT_N = 1024 * 1024 * 128; namespace rp = rocprim; -template< - class Runner, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials -> -__global__ -void kernel(const T* input, T* output) +template +__global__ void kernel(const T* input, T* output) { Runner::template run(input, output); } -template +template struct inclusive_scan { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) + template + __device__ static void run(const T* input, T* output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -91,7 +83,7 @@ struct inclusive_scan using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().inclusive_scan(values, values, storage); @@ -102,23 +94,16 @@ struct inclusive_scan output[i * ItemsPerThread + k] = values[k]; } } - }; -template +template struct exclusive_scan { - template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials - > - __device__ - static void run(const T* input, T* output) + template + __device__ static void run(const T* input, T* output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - using U = typename std::remove_reference::type; + using U = typename std::remove_reference::type; T values[ItemsPerThread]; U init = 100; @@ -131,7 +116,7 @@ struct exclusive_scan using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().exclusive_scan(values, values, init, storage); @@ -142,50 +127,43 @@ struct exclusive_scan output[i * ItemsPerThread + k] = values[k]; } } - }; -template< - class Benchmark, - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; - const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); + const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, 1.0f); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel), - dim3(size/items_per_block), dim3(BlockSize), 0, stream, - d_input, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), + dim3(size / items_per_block), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -197,87 +175,88 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) } // IPT - items per thread -#define CREATE_BENCHMARK(T, BS, IPT) \ - benchmark::RegisterBenchmark( \ - (std::string("block_scan<"#T", "#BS", "#IPT", " + algorithm_name + ">.") + method_name).c_str(), \ - run_benchmark, \ - stream, size \ - ) - -template +#define CREATE_BENCHMARK(T, BS, IPT) \ + benchmark::RegisterBenchmark( \ + (std::string("block_scan<" #T ", " #BS ", " #IPT ", " + algorithm_name + ">.") \ + + method_name) \ + .c_str(), \ + run_benchmark, \ + stream, \ + size) + +template void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - const std::string& algorithm_name, - hipStream_t stream, - size_t size) + const std::string& method_name, + const std::string& algorithm_name, + hipStream_t stream, + size_t size) { - using custom_double2 = custom_type; + using custom_double2 = custom_type; using custom_int_double = custom_type; - std::vector new_benchmarks = - { - // When block size is less than or equal to warp size - CREATE_BENCHMARK(int, 64, 1), - CREATE_BENCHMARK(int, 64, 2), - CREATE_BENCHMARK(int, 64, 4), - CREATE_BENCHMARK(int, 64, 8), - CREATE_BENCHMARK(int, 64, 16), - CREATE_BENCHMARK(float, 64, 1), - CREATE_BENCHMARK(float, 64, 2), - CREATE_BENCHMARK(float, 64, 4), - CREATE_BENCHMARK(float, 64, 8), - CREATE_BENCHMARK(float, 64, 16), - CREATE_BENCHMARK(double, 64, 1), - CREATE_BENCHMARK(double, 64, 2), - CREATE_BENCHMARK(double, 64, 4), - CREATE_BENCHMARK(double, 64, 8), - CREATE_BENCHMARK(double, 64, 16), - - CREATE_BENCHMARK(float, 256, 1), - CREATE_BENCHMARK(float, 256, 2), - CREATE_BENCHMARK(float, 256, 3), - CREATE_BENCHMARK(float, 256, 4), - CREATE_BENCHMARK(float, 256, 8), - CREATE_BENCHMARK(float, 256, 11), - CREATE_BENCHMARK(float, 256, 16), - - CREATE_BENCHMARK(int, 256, 1), - CREATE_BENCHMARK(int, 256, 2), - CREATE_BENCHMARK(int, 256, 3), - CREATE_BENCHMARK(int, 256, 4), - CREATE_BENCHMARK(int, 256, 8), - CREATE_BENCHMARK(int, 256, 11), - CREATE_BENCHMARK(int, 256, 16), - - CREATE_BENCHMARK(int, 320, 1), - CREATE_BENCHMARK(int, 320, 2), - CREATE_BENCHMARK(int, 320, 3), - CREATE_BENCHMARK(int, 320, 4), - CREATE_BENCHMARK(int, 320, 8), - CREATE_BENCHMARK(int, 320, 11), - CREATE_BENCHMARK(int, 320, 16), - - CREATE_BENCHMARK(double, 256, 1), - CREATE_BENCHMARK(double, 256, 2), - CREATE_BENCHMARK(double, 256, 3), - CREATE_BENCHMARK(double, 256, 4), - CREATE_BENCHMARK(double, 256, 8), - CREATE_BENCHMARK(double, 256, 11), - CREATE_BENCHMARK(double, 256, 16), - - CREATE_BENCHMARK(custom_double2, 256, 1), - CREATE_BENCHMARK(custom_double2, 256, 4), - CREATE_BENCHMARK(custom_double2, 256, 8), - - CREATE_BENCHMARK(custom_int_double, 256, 1), - CREATE_BENCHMARK(custom_int_double, 256, 4), - CREATE_BENCHMARK(custom_int_double, 256, 8) - - }; + std::vector new_benchmarks + = {// When block size is less than or equal to warp size + CREATE_BENCHMARK(int, 64, 1), + CREATE_BENCHMARK(int, 64, 2), + CREATE_BENCHMARK(int, 64, 4), + CREATE_BENCHMARK(int, 64, 8), + CREATE_BENCHMARK(int, 64, 16), + CREATE_BENCHMARK(float, 64, 1), + CREATE_BENCHMARK(float, 64, 2), + CREATE_BENCHMARK(float, 64, 4), + CREATE_BENCHMARK(float, 64, 8), + CREATE_BENCHMARK(float, 64, 16), + CREATE_BENCHMARK(double, 64, 1), + CREATE_BENCHMARK(double, 64, 2), + CREATE_BENCHMARK(double, 64, 4), + CREATE_BENCHMARK(double, 64, 8), + CREATE_BENCHMARK(double, 64, 16), + + CREATE_BENCHMARK(float, 256, 1), + CREATE_BENCHMARK(float, 256, 2), + CREATE_BENCHMARK(float, 256, 3), + CREATE_BENCHMARK(float, 256, 4), + CREATE_BENCHMARK(float, 256, 8), + CREATE_BENCHMARK(float, 256, 11), + CREATE_BENCHMARK(float, 256, 16), + + CREATE_BENCHMARK(int, 256, 1), + CREATE_BENCHMARK(int, 256, 2), + CREATE_BENCHMARK(int, 256, 3), + CREATE_BENCHMARK(int, 256, 4), + CREATE_BENCHMARK(int, 256, 8), + CREATE_BENCHMARK(int, 256, 11), + CREATE_BENCHMARK(int, 256, 16), + + CREATE_BENCHMARK(int, 320, 1), + CREATE_BENCHMARK(int, 320, 2), + CREATE_BENCHMARK(int, 320, 3), + CREATE_BENCHMARK(int, 320, 4), + CREATE_BENCHMARK(int, 320, 8), + CREATE_BENCHMARK(int, 320, 11), + CREATE_BENCHMARK(int, 320, 16), + + CREATE_BENCHMARK(double, 256, 1), + CREATE_BENCHMARK(double, 256, 2), + CREATE_BENCHMARK(double, 256, 3), + CREATE_BENCHMARK(double, 256, 4), + CREATE_BENCHMARK(double, 256, 8), + CREATE_BENCHMARK(double, 256, 11), + CREATE_BENCHMARK(double, 256, 16), + + CREATE_BENCHMARK(custom_double2, 256, 1), + CREATE_BENCHMARK(custom_double2, 256, 4), + CREATE_BENCHMARK(custom_double2, 256, 8), + + CREATE_BENCHMARK(custom_int_double, 256, 1), + CREATE_BENCHMARK(custom_int_double, 256, 4), + CREATE_BENCHMARK(custom_int_double, 256, 8) + + }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -286,13 +265,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; @@ -302,23 +281,19 @@ int main(int argc, char *argv[]) // inclusive_scan using_warp_scan using inclusive_scan_uws_t = inclusive_scan; add_benchmarks( - benchmarks, "inclusive_scan", "using_warp_scan", stream, size - ); + benchmarks, "inclusive_scan", "using_warp_scan", stream, size); // exclusive_scan using_warp_scan using exclusive_scan_uws_t = exclusive_scan; add_benchmarks( - benchmarks, "exclusive_scan", "using_warp_scan", stream, size - ); + benchmarks, "exclusive_scan", "using_warp_scan", stream, size); // inclusive_scan reduce then scan using inclusive_scan_rts_t = inclusive_scan; add_benchmarks( - benchmarks, "inclusive_scan", "reduce_then_scan", stream, size - ); + benchmarks, "inclusive_scan", "reduce_then_scan", stream, size); // exclusive_scan reduce then scan using exclusive_scan_rts_t = exclusive_scan; add_benchmarks( - benchmarks, "exclusive_scan", "reduce_then_scan", stream, size - ); + benchmarks, "exclusive_scan", "reduce_then_scan", stream, size); // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_block_sort.cpp b/benchmark/benchmark_block_sort.cpp index bb8a6016b..c1ddf1f43 100644 --- a/benchmark/benchmark_block_sort.cpp +++ b/benchmark/benchmark_block_sort.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; @@ -61,19 +62,14 @@ enum class benchmark_kinds namespace rp = rocprim; -template< - class T, - unsigned int BlockSize, - unsigned int Trials -> -__global__ -void sort_keys_kernel(const T * input, T * output) +template +__global__ void sort_keys_kernel(const T* input, T* output) { const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; T key = input[index]; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_sort bsort; @@ -83,20 +79,15 @@ void sort_keys_kernel(const T * input, T * output) output[index] = key; } -template< - class T, - unsigned int BlockSize, - unsigned int Trials -> -__global__ -void sort_pairs_kernel(const T * input, T * output) +template +__global__ void sort_pairs_kernel(const T* input, T* output) { const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T key = input[index]; + T key = input[index]; T value = key + 1; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_sort bsort; @@ -106,40 +97,30 @@ void sort_pairs_kernel(const T * input, T * output) output[index] = key + value; } -template< - class T, - unsigned int BlockSize, - unsigned int Trials = 10 -> -void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) +template +void run_benchmark(benchmark::State& state, + benchmark_kinds benchmark_kind, + hipStream_t stream, + size_t N) { constexpr auto block_size = BlockSize; - const auto size = block_size * ((N + block_size - 1)/block_size); + const auto size = block_size * ((N + block_size - 1) / block_size); std::vector input; if(std::is_floating_point::value) { - input = get_random_data(size, (T)-1000, (T)+1000); + input = get_random_data(size, (T)-1000, (T) + 1000); } else { input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) @@ -148,26 +129,30 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS if(benchmark_kind == benchmark_kinds::sort_keys) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_keys_kernel), - dim3(size/block_size), dim3(BlockSize), 0, stream, - d_input, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_keys_kernel), + dim3(size / block_size), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_pairs_kernel), - dim3(size/block_size), dim3(BlockSize), 0, stream, - d_input, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_pairs_kernel), + dim3(size / block_size), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -177,42 +162,39 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS) \ -benchmark::RegisterBenchmark( \ - (std::string("block_sort<" #T ", " #BS ">.") + name).c_str(), \ - run_benchmark, \ - benchmark_kind, stream, size \ -) +#define CREATE_BENCHMARK(T, BS) \ + benchmark::RegisterBenchmark((std::string("block_sort<" #T ", " #BS ">.") + name).c_str(), \ + run_benchmark, \ + benchmark_kind, \ + stream, \ + size) -void add_benchmarks(benchmark_kinds benchmark_kind, - const std::string& name, +void add_benchmarks(benchmark_kinds benchmark_kind, + const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { - CREATE_BENCHMARK(int, 64), - CREATE_BENCHMARK(int, 128), - CREATE_BENCHMARK(int, 192), - CREATE_BENCHMARK(int, 256), - CREATE_BENCHMARK(int, 320), - CREATE_BENCHMARK(int, 512), - CREATE_BENCHMARK(int, 1024), - - CREATE_BENCHMARK(long long, 64), - CREATE_BENCHMARK(long long, 128), - CREATE_BENCHMARK(long long, 192), - CREATE_BENCHMARK(long long, 256), - CREATE_BENCHMARK(long long, 320), - CREATE_BENCHMARK(long long, 512), - CREATE_BENCHMARK(long long, 1024) - }; + std::vector bs = {CREATE_BENCHMARK(int, 64), + CREATE_BENCHMARK(int, 128), + CREATE_BENCHMARK(int, 192), + CREATE_BENCHMARK(int, 256), + CREATE_BENCHMARK(int, 320), + CREATE_BENCHMARK(int, 512), + CREATE_BENCHMARK(int, 1024), + + CREATE_BENCHMARK(long long, 64), + CREATE_BENCHMARK(long long, 128), + CREATE_BENCHMARK(long long, 192), + CREATE_BENCHMARK(long long, 256), + CREATE_BENCHMARK(long long, 320), + CREATE_BENCHMARK(long long, 512), + CREATE_BENCHMARK(long long, 1024)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -221,13 +203,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_binary_search.cpp b/benchmark/benchmark_device_binary_search.cpp index 5a84d8f46..b0b54d0e3 100644 --- a/benchmark/benchmark_device_binary_search.cpp +++ b/benchmark/benchmark_device_binary_search.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,90 +40,82 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -void run_lower_bound_benchmark(benchmark::State& state, hipStream_t stream, - size_t haystack_size, size_t needles_size, - bool sorted_needles) +template +void run_lower_bound_benchmark(benchmark::State& state, + hipStream_t stream, + size_t haystack_size, + size_t needles_size, + bool sorted_needles) { using haystack_type = T; - using needle_type = T; - using output_type = size_t; + using needle_type = T; + using output_type = size_t; // Generate data std::vector haystack(haystack_size); std::iota(haystack.begin(), haystack.end(), 0); - std::vector needles = get_random_data( - needles_size, needle_type(0), needle_type(haystack_size) - ); + std::vector needles + = get_random_data(needles_size, needle_type(0), needle_type(haystack_size)); if(sorted_needles) { std::sort(needles.begin(), needles.end()); } - haystack_type * d_haystack; - needle_type * d_needles; - output_type * d_output; + haystack_type* d_haystack; + needle_type* d_needles; + output_type* d_output; HIP_CHECK(hipMalloc(&d_haystack, haystack_size * sizeof(haystack_type))); HIP_CHECK(hipMalloc(&d_needles, needles_size * sizeof(needle_type))); HIP_CHECK(hipMalloc(&d_output, needles_size * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_haystack, haystack.data(), - haystack_size * sizeof(haystack_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_needles, needles.data(), - needles_size * sizeof(needle_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy( + d_haystack, haystack.data(), haystack_size * sizeof(haystack_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_needles, needles.data(), needles_size * sizeof(needle_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK( - rocprim::lower_bound( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - rocprim::less<>(), - stream - ) - ); + HIP_CHECK(rocprim::lower_bound(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + rocprim::less<>(), + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rocprim::lower_bound( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - rocprim::less<>(), - stream - ) - ); + HIP_CHECK(rocprim::lower_bound(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + rocprim::less<>(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -133,21 +125,21 @@ void run_lower_bound_benchmark(benchmark::State& state, hipStream_t stream, for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rocprim::lower_bound( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - rocprim::less<>(), - stream - ) - ); + HIP_CHECK(rocprim::lower_bound(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + rocprim::less<>(), + stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * needles_size * sizeof(needle_type)); @@ -159,16 +151,16 @@ void run_lower_bound_benchmark(benchmark::State& state, hipStream_t stream, HIP_CHECK(hipFree(d_output)); } -#define CREATE_LOWER_BOUND_BENCHMARK(T, K, SORTED) \ -benchmark::RegisterBenchmark( \ - ( \ - std::string("lower_bound") + "<" #T ">(" #K "\% " + \ - (SORTED ? "sorted" : "random") + " needles)" \ - ).c_str(), \ - [=](benchmark::State& state) { run_lower_bound_benchmark(state, stream, size, size * K / 100, SORTED); } \ -) +#define CREATE_LOWER_BOUND_BENCHMARK(T, K, SORTED) \ + benchmark::RegisterBenchmark((std::string("lower_bound") + "<" #T ">(" #K "\% " \ + + (SORTED ? "sorted" : "random") + " needles)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_lower_bound_benchmark( \ + state, stream, size, size * K / 100, SORTED); \ + }) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -177,23 +169,22 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { CREATE_LOWER_BOUND_BENCHMARK(float, 10, false), CREATE_LOWER_BOUND_BENCHMARK(double, 10, false), CREATE_LOWER_BOUND_BENCHMARK(custom_float2, 10, false), diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index df154a30e..b9d09c548 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -20,18 +20,18 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include +#include +#include #include #include -#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -39,14 +39,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -54,10 +55,10 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template +template std::vector generate(size_t size, int entropy_reduction, int lower_level, int upper_level) { if(entropy_reduction >= 5) @@ -67,24 +68,20 @@ std::vector generate(size_t size, int entropy_reduction, int lower_level, int const size_t max_random_size = 1024 * 1024; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { + // Reduce enthropy by applying bitwise AND to random bits + // "An Improved Supercomputer Sorting Benchmark", 1992 + // Kurt Thearling & Stephen Smith + auto v = gen(); + for(int e = 0; e < entropy_reduction; e++) { - // Reduce enthropy by applying bitwise AND to random bits - // "An Improved Supercomputer Sorting Benchmark", 1992 - // Kurt Thearling & Stephen Smith - auto v = gen(); - for(int e = 0; e < entropy_reduction; e++) - { - v &= gen(); - } - return T(lower_level + v % (upper_level - lower_level)); + v &= gen(); } - ); + return T(lower_level + v % (upper_level - lower_level)); + }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); @@ -96,24 +93,30 @@ int get_entropy_percents(int entropy_reduction) { switch(entropy_reduction) { - case 0: return 100; - case 1: return 81; - case 2: return 54; - case 3: return 33; - case 4: return 20; - default: return 0; + case 0: + return 100; + case 1: + return 81; + case 2: + return 54; + case 3: + return 33; + case 4: + return 20; + default: + return 0; } } -const int entropy_reductions[] = { 0, 2, 4, 6 }; +const int entropy_reductions[] = {0, 2, 4, 6}; -template +template void run_even_benchmark(benchmark::State& state, - size_t bins, - size_t scale, - int entropy_reduction, - hipStream_t stream, - size_t size) + size_t bins, + size_t scale, + int entropy_reduction, + hipStream_t stream, + size_t size) { using counter_type = unsigned int; @@ -123,29 +126,24 @@ void run_even_benchmark(benchmark::State& state, // Generate data std::vector input = generate(size, entropy_reduction, lower_level, upper_level); - T * d_input; - counter_type * d_histogram; + T* d_input; + counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - bins + 1, lower_level, upper_level, - stream, false - ) - ); + HIP_CHECK(rp::histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + bins + 1, + lower_level, + upper_level, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -153,39 +151,41 @@ void run_even_benchmark(benchmark::State& state, // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - bins + 1, lower_level, upper_level, - stream, false - ) - ); + HIP_CHECK(rp::histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + bins + 1, + lower_level, + upper_level, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - bins + 1, lower_level, upper_level, - stream, false - ) - ); + HIP_CHECK(rp::histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + bins + 1, + lower_level, + upper_level, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -196,55 +196,51 @@ void run_even_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_histogram)); } -template +template void run_multi_even_benchmark(benchmark::State& state, - size_t bins, - size_t scale, - int entropy_reduction, - hipStream_t stream, - size_t size) + size_t bins, + size_t scale, + int entropy_reduction, + hipStream_t stream, + size_t size) { using counter_type = unsigned int; unsigned int num_levels[ActiveChannels]; - int lower_level[ActiveChannels]; - int upper_level[ActiveChannels]; + int lower_level[ActiveChannels]; + int upper_level[ActiveChannels]; for(unsigned int channel = 0; channel < ActiveChannels; channel++) { lower_level[channel] = 0; upper_level[channel] = bins * scale; - num_levels[channel] = bins + 1; + num_levels[channel] = bins + 1; } // Generate data - std::vector input = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); + std::vector input + = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); - T * d_input; - counter_type * d_histogram[ActiveChannels]; + T* d_input; + counter_type* d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); } - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * Channels * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK(( - rp::multi_histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - num_levels, lower_level, upper_level, - stream, false - ) - )); + HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + num_levels, + lower_level, + upper_level, + stream, + false))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -252,39 +248,41 @@ void run_multi_even_benchmark(benchmark::State& state, // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(( - rp::multi_histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - num_levels, lower_level, upper_level, - stream, false - ) - )); + HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + num_levels, + lower_level, + upper_level, + stream, + false))); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK(( - rp::multi_histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - num_levels, lower_level, upper_level, - stream, false - ) - )); + HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + num_levels, + lower_level, + upper_level, + stream, + false))); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); @@ -298,7 +296,7 @@ void run_multi_even_benchmark(benchmark::State& state, } } -template +template void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) { using counter_type = unsigned int; @@ -309,38 +307,26 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea std::vector levels(bins + 1); std::iota(levels.begin(), levels.end(), 0); - T * d_input; - T * d_levels; - counter_type * d_histogram; + T* d_input; + T* d_levels; + counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_levels, levels.data(), - (bins + 1) * sizeof(T), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(T), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::histogram_range( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - bins + 1, d_levels, - stream, false - ) - ); + HIP_CHECK(rp::histogram_range(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + bins + 1, + d_levels, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -348,39 +334,39 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::histogram_range( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - bins + 1, d_levels, - stream, false - ) - ); + HIP_CHECK(rp::histogram_range(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + bins + 1, + d_levels, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::histogram_range( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_histogram, - bins + 1, d_levels, - stream, false - ) - ); + HIP_CHECK(rp::histogram_range(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_histogram, + bins + 1, + d_levels, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -392,24 +378,23 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea HIP_CHECK(hipFree(d_histogram)); } -#define CREATE_EVEN_BENCHMARK(T, BINS, SCALE) \ -benchmark::RegisterBenchmark( \ - (std::string("histogram_even") + "<" #T ">" + \ - "(" + std::to_string(get_entropy_percents(entropy_reduction)) + "% entropy, " + \ - std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); } \ -) +#define CREATE_EVEN_BENCHMARK(T, BINS, SCALE) \ + benchmark::RegisterBenchmark((std::string("histogram_even") + "<" #T ">" + "(" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "% entropy, " + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_even_benchmark( \ + state, BINS, SCALE, entropy_reduction, stream, size); \ + }) void add_even_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { for(int entropy_reduction : entropy_reductions) { - std::vector bs = - { + std::vector bs = { CREATE_EVEN_BENCHMARK(int, 10, 1234), CREATE_EVEN_BENCHMARK(int, 100, 1234), CREATE_EVEN_BENCHMARK(int, 1000, 1234), @@ -428,27 +413,24 @@ void add_even_benchmarks(std::vector& benchmark }; } -#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ -benchmark::RegisterBenchmark( \ - (std::string("multi_histogram_even") + "<" #CHANNELS ", " #ACTIVE_CHANNELS ", " #T ">" + \ - "(" + std::to_string(get_entropy_percents(entropy_reduction)) + "% entropy, " + \ - std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { \ - run_multi_even_benchmark( \ - state, BINS, SCALE, entropy_reduction, stream, size \ - ); \ - } \ -) +#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ + benchmark::RegisterBenchmark((std::string("multi_histogram_even") \ + + "<" #CHANNELS ", " #ACTIVE_CHANNELS ", " #T ">" + "(" \ + + std::to_string(get_entropy_percents(entropy_reduction)) \ + + "% entropy, " + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_multi_even_benchmark( \ + state, BINS, SCALE, entropy_reduction, stream, size); \ + }) void add_multi_even_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { for(int entropy_reduction : entropy_reductions) { - std::vector bs = - { + std::vector bs = { CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), @@ -463,20 +445,17 @@ void add_multi_even_benchmarks(std::vector& ben }; } -#define CREATE_RANGE_BENCHMARK(T, BINS) \ -benchmark::RegisterBenchmark( \ - (std::string("histogram_range") + "<" #T ">" + \ - "(" + std::to_string(BINS) + " bins)" \ - ).c_str(), \ - [=](benchmark::State& state) { run_range_benchmark(state, BINS, stream, size); } \ -) +#define CREATE_RANGE_BENCHMARK(T, BINS) \ + benchmark::RegisterBenchmark( \ + (std::string("histogram_range") + "<" #T ">" + "(" + std::to_string(BINS) + " bins)") \ + .c_str(), \ + [=](benchmark::State& state) { run_range_benchmark(state, BINS, stream, size); }) void add_range_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { CREATE_RANGE_BENCHMARK(float, 10), CREATE_RANGE_BENCHMARK(float, 100), CREATE_RANGE_BENCHMARK(float, 1000), @@ -487,7 +466,7 @@ void add_range_benchmarks(std::vector& benchmar benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -496,13 +475,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 8b21d4a0a..022d7298a 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -20,11 +20,11 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include #include #include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" @@ -35,14 +35,15 @@ #include "benchmark_utils.hpp" -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } enum memory_operation_method { @@ -63,20 +64,18 @@ enum kernel_operation atomics_inter_warp_collision, }; -template< - kernel_operation Operation, - class T, - unsigned int ItemsPerThread, - unsigned int BlockSize = 0 -> +template struct operation; // no operation -template +template struct operation { - ROCPRIM_HOST_DEVICE inline - void operator()(T (&)[ItemsPerThread], void* = nullptr, unsigned int = 0, T* = nullptr) + ROCPRIM_HOST_DEVICE inline void + operator()(T (&)[ItemsPerThread], void* = nullptr, unsigned int = 0, T* = nullptr) { // No operation } @@ -85,22 +84,22 @@ struct operation #define repeats 30 // custom operation -template +template struct operation { - ROCPRIM_HOST_DEVICE inline - void operator()(T (&input)[ItemsPerThread], - void* shared_storage = nullptr, unsigned int shared_storage_size = 0, - T* global_mem_output = nullptr) + ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], + void* shared_storage = nullptr, + unsigned int shared_storage_size = 0, + T* global_mem_output = nullptr) { - (void) shared_storage; - (void) shared_storage_size; - (void) global_mem_output; - #pragma unroll + (void)shared_storage; + (void)shared_storage_size; + (void)global_mem_output; +#pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; - #pragma unroll +#pragma unroll for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); @@ -110,52 +109,52 @@ struct operation }; // block scan -template +template struct operation { - ROCPRIM_HOST_DEVICE inline - void operator()(T (&input)[ItemsPerThread], - void* shared_storage = nullptr, unsigned int shared_storage_size = 0, - T* global_mem_output = nullptr) + ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], + void* shared_storage = nullptr, + unsigned int shared_storage_size = 0, + T* global_mem_output = nullptr) { - (void) global_mem_output; - using block_scan_type = typename rocprim::block_scan< - T, BlockSize, rocprim::block_scan_algorithm::using_warp_scan>; + (void)global_mem_output; + using block_scan_type = typename rocprim:: + block_scan; block_scan_type bscan; // when using vectorized or striped functions // NOTE: This is not safe but it is the easiest way to prevent code repetition - if(shared_storage == nullptr || - shared_storage_size < sizeof(typename block_scan_type::storage_type)) + if(shared_storage == nullptr + || shared_storage_size < sizeof(typename block_scan_type::storage_type)) { __shared__ typename block_scan_type::storage_type storage; shared_storage = &storage; } bscan.inclusive_scan( - input, input, - *(reinterpret_cast(shared_storage)) - ); + input, + input, + *(reinterpret_cast(shared_storage))); __syncthreads(); } }; // atomics_no_collision -template +template struct operation { - ROCPRIM_HOST_DEVICE inline - void operator()(T (&input)[ItemsPerThread], - void* shared_storage = nullptr, unsigned int shared_storage_size = 0, - T* global_mem_output = nullptr) + ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], + void* shared_storage = nullptr, + unsigned int shared_storage_size = 0, + T* global_mem_output = nullptr) { - (void) shared_storage; - (void) shared_storage_size; - (void) input; - unsigned int index = hipThreadIdx_x * ItemsPerThread + - hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; - #pragma unroll + (void)shared_storage; + (void)shared_storage_size; + (void)input; + unsigned int index + = hipThreadIdx_x * ItemsPerThread + hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; +#pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -164,20 +163,20 @@ struct operation }; // atomics_inter_block_collision -template +template struct operation { - ROCPRIM_HOST_DEVICE inline - void operator()(T (&input)[ItemsPerThread], - void* shared_storage = nullptr, unsigned int shared_storage_size = 0, - T* global_mem_output = nullptr) + ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], + void* shared_storage = nullptr, + unsigned int shared_storage_size = 0, + T* global_mem_output = nullptr) { - (void) shared_storage; - (void) shared_storage_size; - (void) input; - unsigned int index = (hipThreadIdx_x % warpSize) * ItemsPerThread + - hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; - #pragma unroll + (void)shared_storage; + (void)shared_storage_size; + (void)input; + unsigned int index = (hipThreadIdx_x % warpSize) * ItemsPerThread + + hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; +#pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -186,19 +185,19 @@ struct operation }; // atomics_inter_block_collision -template +template struct operation { - ROCPRIM_HOST_DEVICE inline - void operator()(T (&input)[ItemsPerThread], - void* shared_storage = nullptr, unsigned int shared_storage_size = 0, - T* global_mem_output = nullptr) + ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], + void* shared_storage = nullptr, + unsigned int shared_storage_size = 0, + T* global_mem_output = nullptr) { - (void) shared_storage; - (void) shared_storage_size; - (void) input; + (void)shared_storage; + (void)shared_storage_size; + (void)input; unsigned int index = hipThreadIdx_x * ItemsPerThread; - #pragma unroll +#pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -207,31 +206,27 @@ struct operation }; // block_primitive_direct method base kernel -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - memory_operation_method MemOp, - class CustomOp = - typename operation::value_type, - typename std::enable_if::type = 0 -> -__global__ -void operation_kernel(T* input, T* output, CustomOp op) +template ::value_type, + typename std::enable_if::type = 0> +__global__ void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - using block_load_type = typename rocprim::block_load< - T, BlockSize, ItemsPerThread, rocprim::block_load_method::block_load_direct>; - using block_store_type = typename rocprim::block_store< - T, BlockSize, ItemsPerThread, rocprim::block_store_method::block_store_direct>; + using block_load_type = typename rocprim:: + block_load; + using block_store_type = typename rocprim:: + block_store; - block_load_type load; + block_load_type load; block_store_type store; __shared__ union { - typename block_load_type::storage_type load; + typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; @@ -245,79 +240,70 @@ void operation_kernel(T* input, T* output, CustomOp op) } // vectorized method base kernel -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - memory_operation_method MemOp, - class CustomOp = - typename operation::value_type, - typename std::enable_if::type = 0 -> -__global__ -void operation_kernel(T* input, T* output, CustomOp op) +template ::value_type, + typename std::enable_if::type = 0> +__global__ void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - int offset = hipBlockIdx_x * items_per_block; - T items[ItemsPerThread]; + int offset = hipBlockIdx_x * items_per_block; + T items[ItemsPerThread]; - rocprim::block_load_direct_blocked_vectorized - (hipThreadIdx_x, input + offset, items); + rocprim::block_load_direct_blocked_vectorized( + hipThreadIdx_x, input + offset, items); __syncthreads(); op(items, nullptr, 0, output); - rocprim::block_store_direct_blocked_vectorized - (hipThreadIdx_x, output + offset, items); + rocprim::block_store_direct_blocked_vectorized( + hipThreadIdx_x, output + offset, items); } // striped method base kernel -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - memory_operation_method MemOp, - class CustomOp = - typename operation::value_type, - typename std::enable_if::type = 0 -> -__global__ -void operation_kernel(T* input, T* output, CustomOp op) +template ::value_type, + typename std::enable_if::type = 0> +__global__ void operation_kernel(T* input, T* output, CustomOp op) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; - T items[ItemsPerThread]; + T items[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, items); op(items, nullptr, 0, output); rocprim::block_store_direct_striped(lid, output + block_offset, items); } // block_primitives_transpose method base kernel -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - memory_operation_method MemOp, - class CustomOp = - typename operation::value_type, - typename std::enable_if::type = 0 -> -__global__ -void operation_kernel(T* input, T* output, CustomOp op) +template ::value_type, + typename std::enable_if::type = 0> +__global__ void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - using block_load_type = typename rocprim::block_load< - T, BlockSize, ItemsPerThread, rocprim::block_load_method::block_load_transpose>; - using block_store_type = typename rocprim::block_store< - T, BlockSize, ItemsPerThread, rocprim::block_store_method::block_store_transpose>; + using block_load_type = typename rocprim:: + block_load; + using block_store_type = + typename rocprim::block_store; - block_load_type load; + block_load_type load; block_store_type store; __shared__ union { - typename block_load_type::storage_type load; + typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; @@ -330,42 +316,29 @@ void operation_kernel(T* input, T* output, CustomOp op) store.store(output + offset, items, storage.store); } -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - memory_operation_method MemOp, - kernel_operation KernelOp = no_operation -> -void run_benchmark(benchmark::State& state, - size_t size, - const hipStream_t stream) +template +void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream) { - const size_t grid_size = size / (BlockSize * ItemsPerThread); + const size_t grid_size = size / (BlockSize * ItemsPerThread); std::vector input; if(std::is_floating_point::value) { - input = get_random_data(size, (T)-1000, (T)+1000); + input = get_random_data(size, (T)-1000, (T) + 1000); } else { input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); operation selected_operation; @@ -373,11 +346,14 @@ void run_benchmark(benchmark::State& state, // Warm-up for(size_t i = 0; i < 10; i++) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(operation_kernel), - dim3(grid_size), dim3(BlockSize), 0, stream, - d_input, d_output, selected_operation - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(operation_kernel), + dim3(grid_size), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + selected_operation); } HIP_CHECK(hipDeviceSynchronize()); @@ -389,16 +365,20 @@ void run_benchmark(benchmark::State& state, { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), - dim3(grid_size), dim3(BlockSize), 0, stream, - d_input, d_output, selected_operation - ); + dim3(grid_size), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + selected_operation); } HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -408,26 +388,21 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_output)); } -template -void run_benchmark_memcpy(benchmark::State& state, - size_t size, - const hipStream_t stream) +template +void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream) { std::vector input; if(std::is_floating_point::value) { - input = get_random_data(size, (T)-1000, (T)+1000); + input = get_random_data(size, (T)-1000, (T) + 1000); } else { input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); // Warm-up @@ -449,8 +424,8 @@ void run_benchmark_memcpy(benchmark::State& state, HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -460,25 +435,24 @@ void run_benchmark_memcpy(benchmark::State& state, HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT) \ -benchmark::RegisterBenchmark( \ - (#METHOD "_" #OPERATION "<" #T "," #SIZE ",BS:" #BLOCK_SIZE ",IPT:" #IPT ">"), \ - run_benchmark, SIZE, stream \ -) +#define CREATE_BENCHMARK(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT) \ + benchmark::RegisterBenchmark( \ + (#METHOD "_" #OPERATION "<" #T "," #SIZE ",BS:" #BLOCK_SIZE ",IPT:" #IPT ">"), \ + run_benchmark, \ + SIZE, \ + stream) #define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ -benchmark::RegisterBenchmark( \ - ("Memcpy<" #T "," #SIZE">"), run_benchmark_memcpy, SIZE, stream \ -) + benchmark::RegisterBenchmark( \ + ("Memcpy<" #T "," #SIZE ">"), run_benchmark_memcpy, SIZE, stream) -template +template constexpr unsigned int megabytes(unsigned int size) { - return(size * (1024 * 1024 / sizeof(T))); + return (size * (1024 * 1024 / sizeof(T))); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("trials", "trials", -1, "number of iterations"); @@ -489,9 +463,9 @@ int main(int argc, char *argv[]) const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "Device name: " << devProp.name << std::endl; @@ -499,54 +473,88 @@ int main(int argc, char *argv[]) std::cout << "Shared memory per block: " << devProp.sharedMemPerBlock << std::endl; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { // simple memory copy not running kernel CREATE_BENCHMARK_MEMCPY(int, megabytes(128)), // simple memory copy - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 4), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 8), - - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 1024, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, int, megabytes(128), 1024, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 4), // simple memory copy using vector type CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 1), @@ -662,63 +670,110 @@ int main(int argc, char *argv[]) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 8), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 8), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 4), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 1024, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, float, megabytes(128), 1024, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, double, megabytes(128), 1024, 4), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 4), // vectorized - block_scan CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 1), @@ -808,156 +863,484 @@ int main(int argc, char *argv[]) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 1024, 4), // custom_op - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 4), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 4), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 2), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 4), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 4), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 2), + + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 128, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 128, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 128, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 128, + 8), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 128, + 16), + + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 256, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 256, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 256, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 256, + 8), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 256, + 16), + + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 512, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 512, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 512, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 512, + 8), + + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 1024, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + custom_operation, + uint64_t, + megabytes(128), + 1024, + 2), // block_primitives_transpose - atomics no collision - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 8), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 1), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 2), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 4), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 8), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 1), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 2), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 4), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 8), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 16), + + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 1), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 2), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 4), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 8), + + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 1), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 2), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 4), + CREATE_BENCHMARK( + block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 8), // block_primitives_transpose - atomics inter block collision - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 8), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 128, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 128, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 128, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 128, + 8), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 128, + 16), + + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 256, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 256, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 256, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 256, + 8), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 256, + 16), + + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 512, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 512, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 512, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 512, + 8), + + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 1024, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 1024, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 1024, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_block_collision, + int, + megabytes(128), + 1024, + 8), // block_primitives_transpose - atomics inter warp collision - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 8), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 16), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 8), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 16), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 8), - - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 1), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 2), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 4), - CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 8), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 128, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 128, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 128, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 128, + 8), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 128, + 16), + + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 256, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 256, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 256, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 256, + 8), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 256, + 16), + + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 512, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 512, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 512, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 512, + 8), + + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 1024, + 1), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 1024, + 2), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 1024, + 4), + CREATE_BENCHMARK(block_primitives_transpose, + atomics_inter_warp_collision, + int, + megabytes(128), + 1024, + 8), }; diff --git a/benchmark/benchmark_device_merge.cpp b/benchmark/benchmark_device_merge.cpp index 7b61a746a..9a52b46af 100644 --- a/benchmark/benchmark_device_merge.cpp +++ b/benchmark/benchmark_device_merge.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -55,10 +56,10 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template +template void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; @@ -74,71 +75,70 @@ void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_ std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); - key_type * d_keys_input1; - key_type * d_keys_input2; - key_type * d_keys_output; + key_type* d_keys_input1; + key_type* d_keys_input2; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input1, keys_input1.data(), - size1 * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_keys_input2, keys_input2.data(), - size2 * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy( + d_keys_input1, keys_input1.data(), size1 * sizeof(key_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_keys_input2, keys_input2.data(), size2 * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::merge( - d_temporary_storage, temporary_storage_bytes, - d_keys_input1, d_keys_input2, d_keys_output, size1, size2, - compare_op, stream, false - ) - ); + HIP_CHECK(rp::merge(d_temporary_storage, + temporary_storage_bytes, + d_keys_input1, + d_keys_input2, + d_keys_output, + size1, + size2, + compare_op, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::merge( - d_temporary_storage, temporary_storage_bytes, - d_keys_input1, d_keys_input2, d_keys_output, size1, size2, - compare_op, stream, false - ) - ); + HIP_CHECK(rp::merge(d_temporary_storage, + temporary_storage_bytes, + d_keys_input1, + d_keys_input2, + d_keys_output, + size1, + size2, + compare_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::merge( - d_temporary_storage, temporary_storage_bytes, - d_keys_input1, d_keys_input2, d_keys_output, size1, size2, - compare_op, stream, false - ) - ); + HIP_CHECK(rp::merge(d_temporary_storage, + temporary_storage_bytes, + d_keys_input1, + d_keys_input2, + d_keys_output, + size1, + size2, + compare_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -150,10 +150,10 @@ void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_ HIP_CHECK(hipFree(d_keys_output)); } -template +template void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { - using key_type = Key; + using key_type = Key; using value_type = Value; const size_t size1 = size / 2; @@ -171,44 +171,38 @@ void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size std::iota(values_input1.begin(), values_input1.end(), 0); std::iota(values_input2.begin(), values_input2.end(), size1); - key_type * d_keys_input1; - key_type * d_keys_input2; - key_type * d_keys_output; - value_type * d_values_input1; - value_type * d_values_input2; - value_type * d_values_output; + key_type* d_keys_input1; + key_type* d_keys_input2; + key_type* d_keys_output; + value_type* d_values_input1; + value_type* d_values_input2; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_values_input1, size1 * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_input2, size2 * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input1, keys_input1.data(), - size1 * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_keys_input2, keys_input2.data(), - size2 * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy( + d_keys_input1, keys_input1.data(), size1 * sizeof(key_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_keys_input2, keys_input2.data(), size2 * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::merge( - d_temporary_storage, temporary_storage_bytes, - d_keys_input1, d_keys_input2, d_keys_output, - d_values_input1, d_values_input2, d_values_output, - size1, size2, - compare_op, stream, false - ) - ); + HIP_CHECK(rp::merge(d_temporary_storage, + temporary_storage_bytes, + d_keys_input1, + d_keys_input2, + d_keys_output, + d_values_input1, + d_values_input2, + d_values_output, + size1, + size2, + compare_op, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -216,42 +210,51 @@ void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::merge( - d_temporary_storage, temporary_storage_bytes, - d_keys_input1, d_keys_input2, d_keys_output, - d_values_input1, d_values_input2, d_values_output, - size1, size2, - compare_op, stream, false - ) - ); + HIP_CHECK(rp::merge(d_temporary_storage, + temporary_storage_bytes, + d_keys_input1, + d_keys_input2, + d_keys_output, + d_values_input1, + d_values_input2, + d_values_output, + size1, + size2, + compare_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::merge( - d_temporary_storage, temporary_storage_bytes, - d_keys_input1, d_keys_input2, d_keys_output, - d_values_input1, d_values_input2, d_values_output, - size1, size2, - compare_op, stream, false - ) - ); + HIP_CHECK(rp::merge(d_temporary_storage, + temporary_storage_bytes, + d_keys_input1, + d_keys_input2, + d_keys_output, + d_values_input1, + d_values_input2, + d_values_output, + size1, + size2, + compare_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -263,19 +266,18 @@ void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_MERGE_KEYS_BENCHMARK(Key) \ -benchmark::RegisterBenchmark( \ - (std::string("merge") + "<" #Key ">").c_str(), \ - [=](benchmark::State& state) { run_merge_keys_benchmark(state, stream, size); } \ -) +#define CREATE_MERGE_KEYS_BENCHMARK(Key) \ + benchmark::RegisterBenchmark( \ + (std::string("merge") + "<" #Key ">").c_str(), \ + [=](benchmark::State& state) { run_merge_keys_benchmark(state, stream, size); }) -#define CREATE_MERGE_PAIRS_BENCHMARK(Key, Value) \ -benchmark::RegisterBenchmark( \ - (std::string("merge") + "<" #Key ", " #Value ">").c_str(), \ - [=](benchmark::State& state) { run_merge_pairs_benchmark(state, stream, size); } \ -) +#define CREATE_MERGE_PAIRS_BENCHMARK(Key, Value) \ + benchmark::RegisterBenchmark((std::string("merge") + "<" #Key ", " #Value ">").c_str(), \ + [=](benchmark::State& state) { \ + run_merge_pairs_benchmark(state, stream, size); \ + }) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -284,23 +286,22 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_int2 = custom_type; + using custom_int2 = custom_type; using custom_double2 = custom_type; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { CREATE_MERGE_KEYS_BENCHMARK(int), CREATE_MERGE_KEYS_BENCHMARK(long long), CREATE_MERGE_KEYS_BENCHMARK(char), diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index 1fc5f0bd6..5236a16f4 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -55,10 +56,10 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template +template void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; @@ -67,40 +68,33 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t std::vector keys_input; if(std::is_floating_point::value) { - keys_input = get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input = get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::merge_sort( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - lesser_op, stream, false - ) - ); + HIP_CHECK(rp::merge_sort(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + lesser_op, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -108,35 +102,37 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::merge_sort( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - lesser_op, stream, false - ) - ); + HIP_CHECK(rp::merge_sort(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + lesser_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::merge_sort( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - lesser_op, stream, false - ) - ); + HIP_CHECK(rp::merge_sort(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + lesser_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -147,65 +143,55 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t HIP_CHECK(hipFree(d_keys_output)); } -template +template void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { - using key_type = Key; + using key_type = Key; using value_type = Value; // Generate data std::vector keys_input; if(std::is_floating_point::value) { - keys_input = get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input = get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::merge_sort( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - lesser_op, stream, false - ) - ); + HIP_CHECK(rp::merge_sort(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + lesser_op, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -213,40 +199,45 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::merge_sort( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - lesser_op, stream, false - ) - ); + HIP_CHECK(rp::merge_sort(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + lesser_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::merge_sort( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - lesser_op, stream, false - ) - ); + HIP_CHECK(rp::merge_sort(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + lesser_op, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -256,18 +247,16 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ -) +#define CREATE_SORT_KEYS_BENCHMARK(Key) \ + benchmark::RegisterBenchmark( \ + (std::string("sort_keys") + "<" #Key ">").c_str(), \ + [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); }) void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { CREATE_SORT_KEYS_BENCHMARK(int), CREATE_SORT_KEYS_BENCHMARK(long long), @@ -277,21 +266,20 @@ void add_sort_keys_benchmarks(std::vector& benc benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ -) +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ + benchmark::RegisterBenchmark((std::string("sort_pairs") + "<" #Key ", " #Value ">").c_str(), \ + [=](benchmark::State& state) { \ + run_sort_pairs_benchmark(state, stream, size); \ + }) void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; - std::vector bs = - { + std::vector bs = { CREATE_SORT_PAIRS_BENCHMARK(int, float), CREATE_SORT_PAIRS_BENCHMARK(long long, double), @@ -305,7 +293,7 @@ void add_sort_pairs_benchmarks(std::vector& ben benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -314,13 +302,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index 4a1e015c1..a93818db7 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -20,48 +20,49 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include #include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ + exit(error); \ + } \ } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template +template void run_flagged_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float true_probability) + float true_probability) { size = (size * sizeof(int)) / sizeof(T); - std::vector input; + std::vector input; std::vector flags = get_random_data01(size, true_probability); if(std::is_floating_point::value) { @@ -70,69 +71,51 @@ void run_flagged_benchmark(benchmark::State& state, else { input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - T * d_input; - FlagType * d_flags; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + FlagType* d_flags; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(FlagType), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - rocprim::partition( - nullptr, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ); + rocprim::partition(nullptr, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - rocprim::partition( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ); + rocprim::partition(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -142,22 +125,20 @@ void run_flagged_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - rocprim::partition( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ); + rocprim::partition(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -170,67 +151,57 @@ void run_flagged_benchmark(benchmark::State& state, hipFree(d_temp_storage); } -template +template void run_if_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float true_probability) + float true_probability) { - auto select_op = [true_probability] __device__ (const T& value) -> bool - { - if(value < T(10000 * true_probability)) return true; + auto select_op = [true_probability] __device__(const T& value) -> bool { + if(value < T(10000 * true_probability)) + return true; return false; }; std::vector input = get_random_data(size, T(0), T(10000)); - T * d_input; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - rocprim::partition( - nullptr, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ); + rocprim::partition(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - rocprim::partition( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ); + rocprim::partition(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -240,22 +211,20 @@ void run_if_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - rocprim::partition( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ); + rocprim::partition(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -267,19 +236,22 @@ void run_if_benchmark(benchmark::State& state, hipFree(d_temp_storage); } -#define CREATE_PARTITION_FLAGGED_BENCHMARK(T, F, p) \ -benchmark::RegisterBenchmark( \ - ("partition(flags)<" #T "," #F ", "#T", unsigned int>(p = " #p")"), \ - run_flagged_benchmark, size, stream, p \ -) - -#define CREATE_PARTITION_IF_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("partition(if)<" #T ", "#T", unsigned int>(p = " #p")"), \ - run_if_benchmark, size, stream, p \ -) - -int main(int argc, char *argv[]) +#define CREATE_PARTITION_FLAGGED_BENCHMARK(T, F, p) \ + benchmark::RegisterBenchmark( \ + ("partition(flags)<" #T "," #F ", " #T ", unsigned int>(p = " #p ")"), \ + run_flagged_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_PARTITION_IF_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark(("partition(if)<" #T ", " #T ", unsigned int>(p = " #p ")"), \ + run_if_benchmark, \ + size, \ + stream, \ + p) + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -288,23 +260,22 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_double2 = custom_type; + using custom_double2 = custom_type; using custom_int_double = custom_type; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { CREATE_PARTITION_FLAGGED_BENCHMARK(int, unsigned char, 0.75f), CREATE_PARTITION_FLAGGED_BENCHMARK(int, unsigned char, 0.5f), CREATE_PARTITION_FLAGGED_BENCHMARK(int, unsigned char, 0.25f), diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 672754386..068eccd94 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,14 +40,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -55,10 +56,10 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template +template void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; @@ -67,40 +68,32 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t std::vector keys_input; if(std::is_floating_point::value) { - keys_input = get_random_data(size, (key_type)-1000, (key_type)+1000, size); + keys_input = get_random_data(size, (key_type)-1000, (key_type) + 1000, size); } else { keys_input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size - ); + size, std::numeric_limits::min(), std::numeric_limits::max(), size); } - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + 0, + sizeof(key_type) * 8, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -108,37 +101,39 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -149,65 +144,54 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t HIP_CHECK(hipFree(d_keys_output)); } -template +template void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { - using key_type = Key; + using key_type = Key; using value_type = Value; // Generate data std::vector keys_input; if(std::is_floating_point::value) { - keys_input = get_random_data(size, (key_type)-1000, (key_type)+1000, size); + keys_input = get_random_data(size, (key_type)-1000, (key_type) + 1000, size); } else { keys_input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size - ); + size, std::numeric_limits::min(), std::numeric_limits::max(), size); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + 0, + sizeof(key_type) * 8, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -215,42 +199,47 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -260,18 +249,16 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ -) +#define CREATE_SORT_KEYS_BENCHMARK(Key) \ + benchmark::RegisterBenchmark( \ + (std::string("sort_keys") + "<" #Key ">").c_str(), \ + [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); }) void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { CREATE_SORT_KEYS_BENCHMARK(int), CREATE_SORT_KEYS_BENCHMARK(long long), @@ -281,21 +268,20 @@ void add_sort_keys_benchmarks(std::vector& benc benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ -) +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ + benchmark::RegisterBenchmark((std::string("sort_pairs") + "<" #Key ", " #Value ">").c_str(), \ + [=](benchmark::State& state) { \ + run_sort_pairs_benchmark(state, stream, size); \ + }) void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; - std::vector bs = - { + std::vector bs = { CREATE_SORT_PAIRS_BENCHMARK(int, float), CREATE_SORT_PAIRS_BENCHMARK(int, double), CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2), @@ -309,7 +295,7 @@ void add_sort_pairs_benchmarks(std::vector& ben benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -318,13 +304,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index 7fdd658f7..593738ba2 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include +#include +#include #include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" @@ -38,73 +38,61 @@ #include // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template< - class T, - class BinaryFunction -> +template void run_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - BinaryFunction reduce_op) + BinaryFunction reduce_op) { std::vector input = get_random_data(size, T(0), T(1000)); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, T(), size, - reduce_op, stream - ) - ); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); + HIP_CHECK(rocprim::reduce( + d_temp_storage, temp_storage_size_bytes, d_input, d_output, T(), size, reduce_op, stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, T(), size, - reduce_op, stream - ) - ); + HIP_CHECK(rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + T(), + size, + reduce_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -114,19 +102,20 @@ void run_benchmark(benchmark::State& state, for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, T(), size, - reduce_op, stream - ) - ); + HIP_CHECK(rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + T(), + size, + reduce_op, + stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -137,13 +126,14 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_BENCHMARK(T, REDUCE_OP) \ -benchmark::RegisterBenchmark( \ - ("reduce<" #T ", " #REDUCE_OP ">"), \ - run_benchmark, size, stream, REDUCE_OP() \ -) +#define CREATE_BENCHMARK(T, REDUCE_OP) \ + benchmark::RegisterBenchmark(("reduce<" #T ", " #REDUCE_OP ">"), \ + run_benchmark, \ + size, \ + stream, \ + REDUCE_OP()) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -152,23 +142,22 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { CREATE_BENCHMARK(int, rocprim::plus), CREATE_BENCHMARK(long long, rocprim::plus), diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index 7b5759000..d0d6e12ac 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -20,18 +20,18 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include +#include +#include #include #include -#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -39,14 +39,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -54,25 +55,25 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template +template void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) { - using key_type = Key; + using key_type = Key; using value_type = Value; // Generate data std::vector keys_input(size); - unsigned int unique_count = 0; - std::vector key_counts = get_random_data(100000, 1, max_length); - size_t offset = 0; + unsigned int unique_count = 0; + std::vector key_counts = get_random_data(100000, 1, max_length); + size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[unique_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { keys_input[i] = unique_count; @@ -85,49 +86,40 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; + key_type* d_keys_input; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - value_type * d_aggregates_output; - unsigned int * d_unique_count_output; + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); + + key_type* d_unique_output; + value_type* d_aggregates_output; + unsigned int* d_unique_count_output; HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - rp::plus reduce_op; + rp::plus reduce_op; rp::equal_to key_compare_op; - HIP_CHECK( - rp::reduce_by_key( - nullptr, temporary_storage_bytes, - d_keys_input, d_values_input, size, - d_unique_output, d_aggregates_output, - d_unique_count_output, - reduce_op, key_compare_op, - stream - ) - ); + HIP_CHECK(rp::reduce_by_key(nullptr, + temporary_storage_bytes, + d_keys_input, + d_values_input, + size, + d_unique_output, + d_aggregates_output, + d_unique_count_output, + reduce_op, + key_compare_op, + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -135,44 +127,47 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::reduce_by_key( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, size, - d_unique_output, d_aggregates_output, - d_unique_count_output, - reduce_op, key_compare_op, - stream - ) - ); + HIP_CHECK(rp::reduce_by_key(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + size, + d_unique_output, + d_aggregates_output, + d_unique_count_output, + reduce_op, + key_compare_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::reduce_by_key( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, size, - d_unique_output, d_aggregates_output, - d_unique_count_output, - reduce_op, key_compare_op, - stream - ) - ); + HIP_CHECK(rp::reduce_by_key(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + size, + d_unique_output, + d_aggregates_output, + d_unique_count_output, + reduce_op, + key_compare_op, + stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -183,25 +178,24 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea HIP_CHECK(hipFree(d_unique_count_output)); } -#define CREATE_BENCHMARK(Key, Value) \ -benchmark::RegisterBenchmark( \ - (std::string("reduce_by_key") + "<" #Key ", " #Value ">" + \ - "([1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - run_benchmark, \ - max_length, stream, size \ -) +#define CREATE_BENCHMARK(Key, Value) \ + benchmark::RegisterBenchmark((std::string("reduce_by_key") + "<" #Key ", " #Value ">" \ + + "([1, " + std::to_string(max_length) + "])") \ + .c_str(), \ + run_benchmark, \ + max_length, \ + stream, \ + size) -void add_benchmarks(size_t max_length, +void add_benchmarks(size_t max_length, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; - std::vector bs = - { + std::vector bs = { CREATE_BENCHMARK(int, float), CREATE_BENCHMARK(int, double), CREATE_BENCHMARK(int, custom_float2), @@ -216,7 +210,7 @@ void add_benchmarks(size_t max_length, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -225,13 +219,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index f121215f7..ed3bf7deb 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -20,18 +20,18 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include +#include +#include #include #include -#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -39,14 +39,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -54,22 +55,25 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -template -void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) +template +void run_encode_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size) { - using key_type = T; + using key_type = T; using count_type = unsigned int; // Generate data std::vector input(size); - unsigned int runs_count = 0; + unsigned int runs_count = 0; std::vector key_counts = get_random_data(100000, 1, max_length); - size_t offset = 0; + size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; @@ -79,34 +83,29 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ offset += key_count; } - key_type * d_input; + key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - count_type * d_counts_output; - count_type * d_runs_count_output; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + key_type* d_unique_output; + count_type* d_counts_output; + count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::run_length_encode( - nullptr, temporary_storage_bytes, - d_input, size, - d_unique_output, d_counts_output, d_runs_count_output, - stream, false - ) - ); + HIP_CHECK(rp::run_length_encode(nullptr, + temporary_storage_bytes, + d_input, + size, + d_unique_output, + d_counts_output, + d_runs_count_output, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -114,36 +113,40 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ // Warm-up for(size_t i = 0; i < 10; i++) { - HIP_CHECK( - rp::run_length_encode( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_unique_output, d_counts_output, d_runs_count_output, - stream, false - ) - ); + HIP_CHECK(rp::run_length_encode(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_unique_output, + d_counts_output, + d_runs_count_output, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - rp::run_length_encode( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_unique_output, d_counts_output, d_runs_count_output, - stream, false - ); + rp::run_length_encode(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_unique_output, + d_counts_output, + d_runs_count_output, + stream, + false); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -156,23 +159,26 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ HIP_CHECK(hipFree(d_runs_count_output)); } -template -void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) +template +void run_non_trivial_runs_benchmark(benchmark::State& state, + size_t max_length, + hipStream_t stream, + size_t size) { - using key_type = T; + using key_type = T; using offset_type = unsigned int; - using count_type = unsigned int; + using count_type = unsigned int; // Generate data std::vector input(size); - unsigned int runs_count = 0; + unsigned int runs_count = 0; std::vector key_counts = get_random_data(100000, 1, max_length); - size_t offset = 0; + size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; - const size_t end = std::min(size, offset + key_count); + const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; @@ -182,34 +188,29 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, offset += key_count; } - key_type * d_input; + key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets_output; - count_type * d_counts_output; - count_type * d_runs_count_output; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets_output; + count_type* d_counts_output; + count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::run_length_encode_non_trivial_runs( - nullptr, temporary_storage_bytes, - d_input, size, - d_offsets_output, d_counts_output, d_runs_count_output, - stream, false - ) - ); + HIP_CHECK(rp::run_length_encode_non_trivial_runs(nullptr, + temporary_storage_bytes, + d_input, + size, + d_offsets_output, + d_counts_output, + d_runs_count_output, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -217,36 +218,40 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, // Warm-up for(size_t i = 0; i < 10; i++) { - HIP_CHECK( - rp::run_length_encode_non_trivial_runs( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_offsets_output, d_counts_output, d_runs_count_output, - stream, false - ) - ); + HIP_CHECK(rp::run_length_encode_non_trivial_runs(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_offsets_output, + d_counts_output, + d_runs_count_output, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - rp::run_length_encode_non_trivial_runs( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - d_offsets_output, d_counts_output, d_runs_count_output, - stream, false - ); + rp::run_length_encode_non_trivial_runs(d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + d_offsets_output, + d_counts_output, + d_runs_count_output, + stream, + false); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -259,25 +264,24 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, HIP_CHECK(hipFree(d_runs_count_output)); } -#define CREATE_ENCODE_BENCHMARK(T) \ -benchmark::RegisterBenchmark( \ - (std::string("run_length_encode") + "<" #T ">" + \ - "([1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - run_encode_benchmark, \ - max_length, stream, size \ -) +#define CREATE_ENCODE_BENCHMARK(T) \ + benchmark::RegisterBenchmark((std::string("run_length_encode") + "<" #T ">" + "([1, " \ + + std::to_string(max_length) + "])") \ + .c_str(), \ + run_encode_benchmark, \ + max_length, \ + stream, \ + size) -void add_encode_benchmarks(size_t max_length, +void add_encode_benchmarks(size_t max_length, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; - std::vector bs = - { + std::vector bs = { CREATE_ENCODE_BENCHMARK(int), CREATE_ENCODE_BENCHMARK(long long), @@ -288,25 +292,24 @@ void add_encode_benchmarks(size_t max_length, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ -benchmark::RegisterBenchmark( \ - (std::string("run_length_encode_non_trivial_runs") + "<" #T ">" + \ - "([1, " + std::to_string(max_length) + "])" \ - ).c_str(), \ - run_non_trivial_runs_benchmark, \ - max_length, stream, size \ -) +#define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ + benchmark::RegisterBenchmark((std::string("run_length_encode_non_trivial_runs") + "<" #T ">" \ + + "([1, " + std::to_string(max_length) + "])") \ + .c_str(), \ + run_non_trivial_runs_benchmark, \ + max_length, \ + stream, \ + size) -void add_non_trivial_runs_benchmarks(size_t max_length, +void add_non_trivial_runs_benchmarks(size_t max_length, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; - std::vector bs = - { + std::vector bs = { CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), @@ -317,7 +320,7 @@ void add_non_trivial_runs_benchmarks(size_t max_length, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -326,13 +329,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index 8616f09db..03e363b74 100644 --- a/benchmark/benchmark_device_scan.cpp +++ b/benchmark/benchmark_device_scan.cpp @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" @@ -40,127 +40,110 @@ #include "benchmark_utils.hpp" -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -template< - bool Exclusive, - class T, - class BinaryFunction -> -auto run_device_scan(void * temporary_storage, - size_t& storage_size, - T * input, - T * output, - const T initial_value, - const size_t input_size, - BinaryFunction scan_op, +template +auto run_device_scan(void* temporary_storage, + size_t& storage_size, + T* input, + T* output, + const T initial_value, + const size_t input_size, + BinaryFunction scan_op, const hipStream_t stream, - const bool debug = false) - -> typename std::enable_if::type + const bool debug = false) -> + typename std::enable_if::type { - return rocprim::exclusive_scan( - temporary_storage, storage_size, - input, output, initial_value, input_size, - scan_op, stream, debug - ); + return rocprim::exclusive_scan(temporary_storage, + storage_size, + input, + output, + initial_value, + input_size, + scan_op, + stream, + debug); } -template< - bool Exclusive, - class T, - class BinaryFunction -> -auto run_device_scan(void * temporary_storage, - size_t& storage_size, - T * input, - T * output, - const T initial_value, - const size_t input_size, - BinaryFunction scan_op, +template +auto run_device_scan(void* temporary_storage, + size_t& storage_size, + T* input, + T* output, + const T initial_value, + const size_t input_size, + BinaryFunction scan_op, const hipStream_t stream, - const bool debug = false) - -> typename std::enable_if::type + const bool debug = false) -> + typename std::enable_if::type { - (void) initial_value; + (void)initial_value; return rocprim::inclusive_scan( - temporary_storage, storage_size, - input, output, input_size, - scan_op, stream, debug - ); + temporary_storage, storage_size, input, output, input_size, scan_op, stream, debug); } -template< - bool Exclusive, - class T, - class BinaryFunction -> +template void run_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - BinaryFunction scan_op) + BinaryFunction scan_op) { std::vector input; if(std::is_floating_point::value) { - input = get_random_data(size, (T)-1000, (T)+1000); + input = get_random_data(size, (T)-1000, (T) + 1000); } else { input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - T initial_value = get_random_value((T)-1000, (T)+1000); - T * d_input; - T * d_output; + T initial_value = get_random_value((T)-1000, (T) + 1000); + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - ); - HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); + HIP_CHECK(run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream)); + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - HIP_CHECK( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - ); + HIP_CHECK(run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -170,19 +153,20 @@ void run_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - run_device_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, initial_value, size, - scan_op, stream - ) - ); + HIP_CHECK(run_device_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + initial_value, + size, + scan_op, + stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -193,19 +177,21 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_temp_storage)); } -#define CREATE_INCLUSIVE_BENCHMARK(T, SCAN_OP) \ -benchmark::RegisterBenchmark( \ - ("inclusive_scan<" #T "," #SCAN_OP ">"), \ - run_benchmark, size, stream, SCAN_OP() \ -) +#define CREATE_INCLUSIVE_BENCHMARK(T, SCAN_OP) \ + benchmark::RegisterBenchmark(("inclusive_scan<" #T "," #SCAN_OP ">"), \ + run_benchmark, \ + size, \ + stream, \ + SCAN_OP()) -#define CREATE_EXCLUSIVE_BENCHMARK(T, SCAN_OP) \ -benchmark::RegisterBenchmark( \ - ("exclusive_scan<" #T "," #SCAN_OP ">"), \ - run_benchmark, size, stream, SCAN_OP() \ -) +#define CREATE_EXCLUSIVE_BENCHMARK(T, SCAN_OP) \ + benchmark::RegisterBenchmark(("exclusive_scan<" #T "," #SCAN_OP ">"), \ + run_benchmark, \ + size, \ + stream, \ + SCAN_OP()) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -214,40 +200,38 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = custom_type; - using custom_int2 = custom_type; + using custom_int2 = custom_type; // Add benchmarks - std::vector benchmarks = - { - CREATE_INCLUSIVE_BENCHMARK(int, rocprim::plus), - CREATE_EXCLUSIVE_BENCHMARK(int, rocprim::plus), + std::vector benchmarks + = {CREATE_INCLUSIVE_BENCHMARK(int, rocprim::plus), + CREATE_EXCLUSIVE_BENCHMARK(int, rocprim::plus), - CREATE_INCLUSIVE_BENCHMARK(float, rocprim::plus), - CREATE_EXCLUSIVE_BENCHMARK(float, rocprim::plus), + CREATE_INCLUSIVE_BENCHMARK(float, rocprim::plus), + CREATE_EXCLUSIVE_BENCHMARK(float, rocprim::plus), - CREATE_INCLUSIVE_BENCHMARK(double, rocprim::plus), - CREATE_EXCLUSIVE_BENCHMARK(double, rocprim::plus), + CREATE_INCLUSIVE_BENCHMARK(double, rocprim::plus), + CREATE_EXCLUSIVE_BENCHMARK(double, rocprim::plus), - CREATE_INCLUSIVE_BENCHMARK(long long, rocprim::plus), - CREATE_EXCLUSIVE_BENCHMARK(long long, rocprim::plus), + CREATE_INCLUSIVE_BENCHMARK(long long, rocprim::plus), + CREATE_EXCLUSIVE_BENCHMARK(long long, rocprim::plus), - CREATE_INCLUSIVE_BENCHMARK(custom_double2, rocprim::plus), - CREATE_EXCLUSIVE_BENCHMARK(custom_double2, rocprim::plus), - CREATE_INCLUSIVE_BENCHMARK(custom_int2, rocprim::plus), - CREATE_EXCLUSIVE_BENCHMARK(custom_int2, rocprim::plus) - }; + CREATE_INCLUSIVE_BENCHMARK(custom_double2, rocprim::plus), + CREATE_EXCLUSIVE_BENCHMARK(custom_double2, rocprim::plus), + CREATE_INCLUSIVE_BENCHMARK(custom_int2, rocprim::plus), + CREATE_EXCLUSIVE_BENCHMARK(custom_int2, rocprim::plus)}; // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 785443fde..8b8c02107 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -20,18 +20,18 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include +#include +#include #include #include -#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -39,14 +39,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -54,29 +55,30 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -const unsigned int batch_size = 4; +const unsigned int batch_size = 4; const unsigned int warmup_size = 2; -template +template void run_sort_keys_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, size_t size) + size_t desired_segments, + hipStream_t stream, + size_t size) { using offset_type = int; - using key_type = Key; + using key_type = Key; // Generate data std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; - size_t offset = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); @@ -89,50 +91,42 @@ void run_sort_keys_benchmark(benchmark::State& state, std::vector keys_input; if(std::is_floating_point::value) { - keys_input = get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input = get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - offset_type * d_offsets; + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::segmented_radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - segments_count, d_offsets, d_offsets + 1, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -140,39 +134,45 @@ void run_sort_keys_benchmark(benchmark::State& state, // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::segmented_radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - segments_count, d_offsets, d_offsets + 1, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::segmented_radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - segments_count, d_offsets, d_offsets + 1, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); @@ -184,27 +184,28 @@ void run_sort_keys_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_keys_output)); } -template +template void run_sort_pairs_benchmark(benchmark::State& state, - size_t desired_segments, - hipStream_t stream, size_t size) + size_t desired_segments, + hipStream_t stream, + size_t size) { using offset_type = int; - using key_type = Key; - using value_type = Value; + using key_type = Key; + using value_type = Value; // Generate data std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; - size_t offset = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); @@ -217,65 +218,54 @@ void run_sort_pairs_benchmark(benchmark::State& state, std::vector keys_input; if(std::is_floating_point::value) { - keys_input = get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input = get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - offset_type * d_offsets; + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_keys_input; - key_type * d_keys_output; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - void * d_temporary_storage = nullptr; + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); + + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::segmented_radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream, + false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -283,44 +273,53 @@ void run_sort_pairs_benchmark(benchmark::State& state, // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::segmented_radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::segmented_radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, - 0, sizeof(key_type) * 8, - stream, false - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + 0, + sizeof(key_type) * 8, + stream, + false)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } - state.SetBytesProcessed( - state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) - ); + state.SetBytesProcessed(state.iterations() * batch_size * size + * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); @@ -331,20 +330,19 @@ void run_sort_pairs_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_values_output)); } -#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size); } \ -) +#define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ + benchmark::RegisterBenchmark( \ + (std::string("sort_keys") + "<" #Key ">" + "(~" + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_sort_keys_benchmark(state, SEGMENTS, stream, size); \ + }) void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { + std::vector bs = { CREATE_SORT_KEYS_BENCHMARK(int, 1), CREATE_SORT_KEYS_BENCHMARK(int, 10), CREATE_SORT_KEYS_BENCHMARK(int, 100), @@ -364,23 +362,23 @@ void add_sort_keys_benchmarks(std::vector& benc benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); } \ -) +#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ + benchmark::RegisterBenchmark((std::string("sort_pairs") + "<" #Key ", " #Value ">" + "(~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + [=](benchmark::State& state) { \ + run_sort_pairs_benchmark( \ + state, SEGMENTS, stream, size); \ + }) void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; - std::vector bs = - { + std::vector bs = { CREATE_SORT_PAIRS_BENCHMARK(int, float, 1), CREATE_SORT_PAIRS_BENCHMARK(int, float, 10), CREATE_SORT_PAIRS_BENCHMARK(int, float, 100), @@ -416,7 +414,7 @@ void add_sort_pairs_benchmarks(std::vector& ben benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -425,13 +423,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index 80ec54dca..65fe180cb 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -20,18 +20,18 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include +#include +#include #include #include -#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -39,14 +39,15 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -54,25 +55,28 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size) +template +void run_benchmark(benchmark::State& state, + size_t desired_segments, + hipStream_t stream, + size_t size) { using offset_type = int; - using value_type = T; + using value_type = T; // Generate data - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); const double avg_segment_length = static_cast(size) / desired_segments; std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); @@ -85,45 +89,37 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - offset_type * d_offsets; + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + value_type* d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_aggregates_output; + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); + + value_type* d_aggregates_output; HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(value_type))); rocprim::plus reduce_op; - value_type init(0); + value_type init(0); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::segmented_reduce( - d_temporary_storage, temporary_storage_bytes, - d_values_input, d_aggregates_output, - segments_count, - d_offsets, d_offsets + 1, - reduce_op, init, - stream - ) - ); + HIP_CHECK(rp::segmented_reduce(d_temporary_storage, + temporary_storage_bytes, + d_values_input, + d_aggregates_output, + segments_count, + d_offsets, + d_offsets + 1, + reduce_op, + init, + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); @@ -131,41 +127,41 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rp::segmented_reduce( - d_temporary_storage, temporary_storage_bytes, - d_values_input, d_aggregates_output, - segments_count, - d_offsets, d_offsets + 1, - reduce_op, init, - stream - ) - ); + HIP_CHECK(rp::segmented_reduce(d_temporary_storage, + temporary_storage_bytes, + d_values_input, + d_aggregates_output, + segments_count, + d_offsets, + d_offsets + 1, + reduce_op, + init, + stream)); } HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rp::segmented_reduce( - d_temporary_storage, temporary_storage_bytes, - d_values_input, d_aggregates_output, - segments_count, - d_offsets, d_offsets + 1, - reduce_op, init, - stream - ) - ); + HIP_CHECK(rp::segmented_reduce(d_temporary_storage, + temporary_storage_bytes, + d_values_input, + d_aggregates_output, + segments_count, + d_offsets, + d_offsets + 1, + reduce_op, + init, + stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); @@ -177,24 +173,23 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t HIP_CHECK(hipFree(d_aggregates_output)); } -#define CREATE_BENCHMARK(T, SEGMENTS) \ -benchmark::RegisterBenchmark( \ - (std::string("segmented_reduce") + "<" #T ">" + \ - "(~" + std::to_string(SEGMENTS) + " segments)" \ - ).c_str(), \ - run_benchmark, \ - SEGMENTS, stream, size \ -) +#define CREATE_BENCHMARK(T, SEGMENTS) \ + benchmark::RegisterBenchmark((std::string("segmented_reduce") + "<" #T ">" + "(~" \ + + std::to_string(SEGMENTS) + " segments)") \ + .c_str(), \ + run_benchmark, \ + SEGMENTS, \ + stream, \ + size) void add_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; - std::vector bs = - { + std::vector bs = { CREATE_BENCHMARK(float, 1), CREATE_BENCHMARK(float, 10), CREATE_BENCHMARK(float, 100), @@ -227,7 +222,7 @@ void add_benchmarks(std::vector& benchmarks, benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -236,13 +231,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index 4418fd9ec..387e06874 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -20,47 +20,48 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include #include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ + exit(error); \ + } \ } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template +template void run_flagged_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float true_probability) + float true_probability) { - std::vector input; - std::vector flags = get_random_data01(size, true_probability); + std::vector input; + std::vector flags = get_random_data01(size, true_probability); std::vector selected_count_output(1); if(std::is_floating_point::value) { @@ -69,69 +70,51 @@ void run_flagged_benchmark(benchmark::State& state, else { input = get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - T * d_input; - FlagType * d_flags; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + FlagType* d_flags; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(FlagType), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - rocprim::select( - nullptr, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ); + rocprim::select(nullptr, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - rocprim::select( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ); + rocprim::select(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -141,22 +124,20 @@ void run_flagged_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - rocprim::select( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - d_output, - d_selected_count_output, - input.size(), - stream - ); + rocprim::select(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + d_output, + d_selected_count_output, + input.size(), + stream); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -170,70 +151,60 @@ void run_flagged_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); } -template +template void run_selectop_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float true_probability) + float true_probability) { - std::vector input = get_random_data(size, T(0), T(1000)); + std::vector input = get_random_data(size, T(0), T(1000)); std::vector selected_count_output(1); - auto select_op = [true_probability] __device__ (const T& value) -> bool - { - if(value < T(1000 * true_probability)) return true; + auto select_op = [true_probability] __device__(const T& value) -> bool { + if(value < T(1000 * true_probability)) + return true; return false; }; - T * d_input; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - rocprim::select( - nullptr, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ); + rocprim::select(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - rocprim::select( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ); + rocprim::select(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -243,22 +214,20 @@ void run_selectop_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - rocprim::select( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - select_op, - stream - ); + rocprim::select(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + select_op, + stream); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -271,74 +240,64 @@ void run_selectop_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); } -template +template void run_unique_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - float discontinuity_probability) + float discontinuity_probability) { std::vector input(size); { auto input01 = get_random_data01(size, discontinuity_probability); - auto acc = input01[0]; - input[0] = acc; + auto acc = input01[0]; + input[0] = acc; for(size_t i = 1; i < input01.size(); i++) { input[i] = acc + input01[i]; } } std::vector selected_count_output(1); - auto equality_op = rocprim::equal_to(); + auto equality_op = rocprim::equal_to(); - T * d_input; - T * d_output; - unsigned int * d_selected_count_output; + T* d_input; + T* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage - rocprim::unique( - nullptr, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - equality_op, - stream - ); + rocprim::unique(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + equality_op, + stream); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { - rocprim::unique( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - equality_op, - stream - ); + rocprim::unique(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + equality_op, + stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -348,22 +307,20 @@ void run_unique_benchmark(benchmark::State& state, auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { - rocprim::unique( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_output, - d_selected_count_output, - input.size(), - equality_op, - stream - ); + rocprim::unique(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_selected_count_output, + input.size(), + equality_op, + stream); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -375,25 +332,29 @@ void run_unique_benchmark(benchmark::State& state, hipFree(d_temp_storage); } -#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ -benchmark::RegisterBenchmark( \ - ("select_flagged<" #T "," #F ", "#T", unsigned int>(p = " #p")"), \ - run_flagged_benchmark, size, stream, p \ -) - -#define CREATE_SELECT_IF_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("select_if<" #T ", "#T", unsigned int>(p = " #p")"), \ - run_selectop_benchmark, size, stream, p \ -) - -#define CREATE_UNIQUE_BENCHMARK(T, p) \ -benchmark::RegisterBenchmark( \ - ("unique<" #T ", "#T", unsigned int>(p = " #p")"), \ - run_unique_benchmark, size, stream, p \ -) - -int main(int argc, char *argv[]) +#define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ + benchmark::RegisterBenchmark( \ + ("select_flagged<" #T "," #F ", " #T ", unsigned int>(p = " #p ")"), \ + run_flagged_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_SELECT_IF_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark(("select_if<" #T ", " #T ", unsigned int>(p = " #p ")"), \ + run_selectop_benchmark, \ + size, \ + stream, \ + p) + +#define CREATE_UNIQUE_BENCHMARK(T, p) \ + benchmark::RegisterBenchmark(("unique<" #T ", " #T ", unsigned int>(p = " #p ")"), \ + run_unique_benchmark, \ + size, \ + stream, \ + p) + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -402,59 +363,57 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_double2 = custom_type; + using custom_double2 = custom_type; using custom_int_double = custom_type; // Add benchmarks - std::vector benchmarks = - { - CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.95f), - CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.75f), - CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.5f), - CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.25f), - CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.10f), - - CREATE_SELECT_FLAGGED_BENCHMARK(float, unsigned char, 0.5f), - CREATE_SELECT_FLAGGED_BENCHMARK(double, unsigned char, 0.5f), - CREATE_SELECT_FLAGGED_BENCHMARK(custom_double2, unsigned char, 0.5f), - CREATE_SELECT_FLAGGED_BENCHMARK(custom_int_double, unsigned char, 0.5f), - - CREATE_SELECT_IF_BENCHMARK(int, 0.95f), - CREATE_SELECT_IF_BENCHMARK(int, 0.75f), - CREATE_SELECT_IF_BENCHMARK(int, 0.5f), - CREATE_SELECT_IF_BENCHMARK(int, 0.25f), - CREATE_SELECT_IF_BENCHMARK(int, 0.10f), - - CREATE_SELECT_IF_BENCHMARK(unsigned char, 0.5f), - CREATE_SELECT_IF_BENCHMARK(float, 0.5f), - CREATE_SELECT_IF_BENCHMARK(double, 0.5f), - CREATE_SELECT_IF_BENCHMARK(custom_double2, 0.5f), - CREATE_SELECT_IF_BENCHMARK(custom_int_double, 0.5f), - - CREATE_UNIQUE_BENCHMARK(int, 0.75f), - CREATE_UNIQUE_BENCHMARK(int, 0.5f), - CREATE_UNIQUE_BENCHMARK(int, 0.1f), - CREATE_UNIQUE_BENCHMARK(int, 0.05f), - CREATE_UNIQUE_BENCHMARK(int, 0.01f), - CREATE_UNIQUE_BENCHMARK(int, 0.005f), - - CREATE_UNIQUE_BENCHMARK(unsigned char, 0.1f), - CREATE_UNIQUE_BENCHMARK(float, 0.1f), - CREATE_UNIQUE_BENCHMARK(double, 0.1f), - CREATE_UNIQUE_BENCHMARK(custom_double2, 0.1f), - CREATE_UNIQUE_BENCHMARK(custom_int_double, 0.1f) - }; + std::vector benchmarks + = {CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.95f), + CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.75f), + CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.5f), + CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.25f), + CREATE_SELECT_FLAGGED_BENCHMARK(int, unsigned char, 0.10f), + + CREATE_SELECT_FLAGGED_BENCHMARK(float, unsigned char, 0.5f), + CREATE_SELECT_FLAGGED_BENCHMARK(double, unsigned char, 0.5f), + CREATE_SELECT_FLAGGED_BENCHMARK(custom_double2, unsigned char, 0.5f), + CREATE_SELECT_FLAGGED_BENCHMARK(custom_int_double, unsigned char, 0.5f), + + CREATE_SELECT_IF_BENCHMARK(int, 0.95f), + CREATE_SELECT_IF_BENCHMARK(int, 0.75f), + CREATE_SELECT_IF_BENCHMARK(int, 0.5f), + CREATE_SELECT_IF_BENCHMARK(int, 0.25f), + CREATE_SELECT_IF_BENCHMARK(int, 0.10f), + + CREATE_SELECT_IF_BENCHMARK(unsigned char, 0.5f), + CREATE_SELECT_IF_BENCHMARK(float, 0.5f), + CREATE_SELECT_IF_BENCHMARK(double, 0.5f), + CREATE_SELECT_IF_BENCHMARK(custom_double2, 0.5f), + CREATE_SELECT_IF_BENCHMARK(custom_int_double, 0.5f), + + CREATE_UNIQUE_BENCHMARK(int, 0.75f), + CREATE_UNIQUE_BENCHMARK(int, 0.5f), + CREATE_UNIQUE_BENCHMARK(int, 0.1f), + CREATE_UNIQUE_BENCHMARK(int, 0.05f), + CREATE_UNIQUE_BENCHMARK(int, 0.01f), + CREATE_UNIQUE_BENCHMARK(int, 0.005f), + + CREATE_UNIQUE_BENCHMARK(unsigned char, 0.1f), + CREATE_UNIQUE_BENCHMARK(float, 0.1f), + CREATE_UNIQUE_BENCHMARK(double, 0.1f), + CREATE_UNIQUE_BENCHMARK(custom_double2, 0.1f), + CREATE_UNIQUE_BENCHMARK(custom_int_double, 0.1f)}; // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_device_transform.cpp b/benchmark/benchmark_device_transform.cpp index 6d00d450d..5e7fc2cb0 100644 --- a/benchmark/benchmark_device_transform.cpp +++ b/benchmark/benchmark_device_transform.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,65 +40,51 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif -const unsigned int batch_size = 10; +const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template +template struct transform { - __device__ __host__ - constexpr T operator()(const T& a) const + __device__ __host__ constexpr T operator()(const T& a) const { return a + T(5); } }; -template< - class T, - class BinaryFunction -> +template void run_benchmark(benchmark::State& state, - size_t size, + size_t size, const hipStream_t stream, - BinaryFunction transform_op) + BinaryFunction transform_op) { std::vector input = get_random_data(size, T(0), T(1000)); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK( - rocprim::transform( - d_input, d_output, size, - transform_op, stream - ) - ); + HIP_CHECK(rocprim::transform(d_input, d_output, size, transform_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -108,18 +94,13 @@ void run_benchmark(benchmark::State& state, for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK( - rocprim::transform( - d_input, d_output, size, - transform_op, stream - ) - ); + HIP_CHECK(rocprim::transform(d_input, d_output, size, transform_op, stream)); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); @@ -129,13 +110,14 @@ void run_benchmark(benchmark::State& state, HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, TRANSFORM_OP) \ -benchmark::RegisterBenchmark( \ - ("transform<" #T ", " #TRANSFORM_OP ">"), \ - run_benchmark, size, stream, TRANSFORM_OP() \ -) +#define CREATE_BENCHMARK(T, TRANSFORM_OP) \ + benchmark::RegisterBenchmark(("transform<" #T ", " #TRANSFORM_OP ">"), \ + run_benchmark, \ + size, \ + stream, \ + TRANSFORM_OP()) -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -144,23 +126,22 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_float2 = custom_type; + using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks - std::vector benchmarks = - { + std::vector benchmarks = { CREATE_BENCHMARK(int, transform), CREATE_BENCHMARK(long long, transform), diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 9979ecd17..a5ef068b0 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -22,26 +22,25 @@ #define ROCPRIM_BENCHMARK_UTILS_HPP_ #include -#include #include #include +#include #include // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); + std::random_device rd; + std::default_random_engine gen(rd()); std::uniform_int_distribution distribution(min, max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { + return distribution(gen); + }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); @@ -49,18 +48,17 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = return data; } -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); + std::random_device rd; + std::default_random_engine gen(rd()); std::uniform_real_distribution distribution(min, max); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { + return distribution(gen); + }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); @@ -68,17 +66,16 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = return data; } -template +template inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = 1024 * 1024) { - std::random_device rd; - std::default_random_engine gen(rd()); + std::random_device rd; + std::default_random_engine gen(rd()); std::bernoulli_distribution distribution(p); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { + return distribution(gen); + }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); @@ -86,63 +83,64 @@ inline std::vector get_random_data01(size_t size, float p, size_t max_random_ return data; } -template +template inline T get_random_value(T min, T max) { return get_random_data(1, min, max)[0]; } -template +template struct custom_type { - using first_type = T; + using first_type = T; using second_type = U; T x; U y; - ROCPRIM_HOST_DEVICE inline - custom_type(T xx = 0, U yy = 0) : x(xx), y(yy) + ROCPRIM_HOST_DEVICE inline custom_type(T xx = 0, U yy = 0) + : x(xx) + , y(yy) { } - ROCPRIM_HOST_DEVICE inline - ~custom_type() = default; + ROCPRIM_HOST_DEVICE inline ~custom_type() = default; - ROCPRIM_HOST_DEVICE inline - custom_type operator+(const custom_type& rhs) const + ROCPRIM_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const { return custom_type(x + rhs.x, y + rhs.y); } - ROCPRIM_HOST_DEVICE inline - bool operator<(const custom_type& rhs) const + ROCPRIM_HOST_DEVICE inline bool operator<(const custom_type& rhs) const { return (x < rhs.x || (x == rhs.x && y < rhs.y)); } - ROCPRIM_HOST_DEVICE inline - bool operator==(const custom_type& rhs) const + ROCPRIM_HOST_DEVICE inline bool operator==(const custom_type& rhs) const { return x == rhs.x && y == rhs.y; } }; -template -struct is_custom_type : std::false_type {}; +template +struct is_custom_type : std::false_type +{ +}; -template -struct is_custom_type> : std::true_type {}; +template +struct is_custom_type> : std::true_type +{ +}; -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) - -> typename std::enable_if::value, std::vector>::type +template +inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> + typename std::enable_if::value, std::vector>::type { - using first_type = typename T::first_type; + using first_type = typename T::first_type; using second_type = typename T::second_type; std::vector data(size); - auto fdata = get_random_data(size, min.x, max.x, max_random_size); - auto sdata = get_random_data(size, min.y, max.y, max_random_size); + auto fdata = get_random_data(size, min.x, max.x, max_random_size); + auto sdata = get_random_data(size, min.y, max.y, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(fdata[i], sdata[i]); diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index 5246a4c61..edee09a6f 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -20,19 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser -#include "cmdparser.hpp" #include "benchmark_utils.hpp" +#include "cmdparser.hpp" // HIP API #include @@ -40,27 +40,22 @@ // rocPRIM #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif -template< - bool AllReduce, - class T, - unsigned int WarpSize, - unsigned int Trials -> -__global__ -void warp_reduce_kernel(const T * d_input, T * d_output) +template +__global__ void warp_reduce_kernel(const T* d_input, T* d_output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -68,7 +63,7 @@ void warp_reduce_kernel(const T * d_input, T * d_output) using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().reduce(value, value, storage); @@ -77,23 +72,17 @@ void warp_reduce_kernel(const T * d_input, T * d_output) d_output[i] = value; } -template< - class T, - class Flag, - unsigned int WarpSize, - unsigned int Trials -> -__global__ -void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) +template +__global__ void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = d_input[i]; - auto flag = d_flags[i]; + auto flag = d_flags[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().head_segmented_reduce(value, value, flag, storage); @@ -102,99 +91,83 @@ void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) d_output[i] = value; } -template< - bool AllReduce, - bool Segmented, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials, - class T, - class Flag -> -inline -auto execute_warp_reduce_kernel(T* input, T* output, Flag* /* flags */, - size_t size, hipStream_t stream) - -> typename std::enable_if::type +template +inline auto execute_warp_reduce_kernel( + T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> + typename std::enable_if::type { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_reduce_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - input, output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + input, + output); HIP_CHECK(hipPeekAtLastError()); } -template< - bool AllReduce, - bool Segmented, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials, - class T, - class Flag -> -inline -auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags, - size_t size, hipStream_t stream) - -> typename std::enable_if::type +template +inline auto + execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) -> + typename std::enable_if::type { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(segmented_warp_reduce_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - input, flags, output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_warp_reduce_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + input, + flags, + output); HIP_CHECK(hipPeekAtLastError()); } -template< - bool AllReduce, - bool Segmented, - class T, - unsigned int WarpSize, - unsigned int BlockSize, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { using flag_type = unsigned char; - const auto size = BlockSize * ((N + BlockSize - 1)/BlockSize); + const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize); - std::vector input = get_random_data(size, T(0), T(10)); + std::vector input = get_random_data(size, T(0), T(10)); std::vector flags = get_random_data(size, 0, 1); - T * d_input; - flag_type * d_flags; - T * d_output; + T* d_input; + flag_type* d_flags; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - size * sizeof(flag_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); execute_warp_reduce_kernel( - d_input, d_output, d_flags, size, stream - ); + d_input, d_output, d_flags, size, stream); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); @@ -205,33 +178,30 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) HIP_CHECK(hipFree(d_flags)); } -#define CREATE_BENCHMARK(T, WS, BS) \ -benchmark::RegisterBenchmark( \ - (std::string("warp_reduce<" #T ", " #WS ", " #BS ">.") + name).c_str(), \ - run_benchmark, \ - stream, size \ -) +#define CREATE_BENCHMARK(T, WS, BS) \ + benchmark::RegisterBenchmark( \ + (std::string("warp_reduce<" #T ", " #WS ", " #BS ">.") + name).c_str(), \ + run_benchmark, \ + stream, \ + size) -template -void add_benchmarks(const std::string& name, +template +void add_benchmarks(const std::string& name, std::vector& benchmarks, - hipStream_t stream, - size_t size) + hipStream_t stream, + size_t size) { - std::vector bs = - { - CREATE_BENCHMARK(int, 32, 64), - CREATE_BENCHMARK(int, 64, 64), - CREATE_BENCHMARK(int, 37, 64), - CREATE_BENCHMARK(int, 61, 64), - CREATE_BENCHMARK(double, 64, 64), - CREATE_BENCHMARK(double, 61, 64) - }; + std::vector bs = {CREATE_BENCHMARK(int, 32, 64), + CREATE_BENCHMARK(int, 64, 64), + CREATE_BENCHMARK(int, 37, 64), + CREATE_BENCHMARK(int, 61, 64), + CREATE_BENCHMARK(double, 64, 64), + CREATE_BENCHMARK(double, 61, 64)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -240,13 +210,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index 84a078739..ab91f208a 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" @@ -39,14 +39,15 @@ #include "benchmark_utils.hpp" -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -54,16 +55,15 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -template -__global__ -void warp_inclusive_scan_kernel(const T* input, T* output) +template +__global__ void warp_inclusive_scan_kernel(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().inclusive_scan(value, value, storage); @@ -72,16 +72,15 @@ void warp_inclusive_scan_kernel(const T* input, T* output) output[i] = value; } -template -__global__ -void warp_exclusive_scan_kernel(const T* input, T* output, const T init) +template +__global__ void warp_exclusive_scan_kernel(const T* input, T* output, const T init) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto value = input[i]; + const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().exclusive_scan(value, value, init, storage); @@ -90,57 +89,54 @@ void warp_exclusive_scan_kernel(const T* input, T* output, const T init) output[i] = value; } -template< - class T, - unsigned int BlockSize, - unsigned int WarpSize, - bool Inclusive = true, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { // Make sure size is a multiple of BlockSize - size = BlockSize * ((size + BlockSize - 1)/BlockSize); + size = BlockSize * ((size + BlockSize - 1) / BlockSize); // Allocate and fill memory std::vector input(size, 1.0f); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - for (auto _ : state) + for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(Inclusive) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_inclusive_scan_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - d_input, d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_inclusive_scan_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + d_input, + d_output); } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_exclusive_scan_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - d_input, d_output, input[0] - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_exclusive_scan_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + d_input, + d_output, + input[0]); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } @@ -151,49 +147,47 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) HIP_CHECK(hipFree(d_output)); } -#define CREATE_BENCHMARK(T, BS, WS, INCLUSIVE) \ - benchmark::RegisterBenchmark( \ - (std::string("warp_scan<"#T", "#BS", "#WS">.") + method_name).c_str(), \ - run_benchmark, \ - stream, size \ - ) +#define CREATE_BENCHMARK(T, BS, WS, INCLUSIVE) \ + benchmark::RegisterBenchmark( \ + (std::string("warp_scan<" #T ", " #BS ", " #WS ">.") + method_name).c_str(), \ + run_benchmark, \ + stream, \ + size) -template +template void add_benchmarks(std::vector& benchmarks, - const std::string& method_name, - hipStream_t stream, - size_t size) + const std::string& method_name, + hipStream_t stream, + size_t size) { - using custom_double2 = custom_type; + using custom_double2 = custom_type; using custom_int_double = custom_type; - std::vector new_benchmarks = - { - CREATE_BENCHMARK(float, 64, 64, Inclusive), - CREATE_BENCHMARK(float, 128, 64, Inclusive), - CREATE_BENCHMARK(float, 256, 64, Inclusive), - CREATE_BENCHMARK(float, 256, 32, Inclusive), - CREATE_BENCHMARK(float, 256, 16, Inclusive), - // force using shared memory version - CREATE_BENCHMARK(float, 63, 63, Inclusive), - CREATE_BENCHMARK(float, 62, 31, Inclusive), - CREATE_BENCHMARK(float, 60, 15, Inclusive), - - CREATE_BENCHMARK(int, 64, 64, Inclusive), - CREATE_BENCHMARK(int, 128, 64, Inclusive), - CREATE_BENCHMARK(int, 256, 64, Inclusive), - - CREATE_BENCHMARK(double, 64, 64, Inclusive), - CREATE_BENCHMARK(double, 128, 64, Inclusive), - CREATE_BENCHMARK(double, 256, 64, Inclusive), - - CREATE_BENCHMARK(custom_double2, 64, 64, Inclusive), - CREATE_BENCHMARK(custom_int_double, 64, 64, Inclusive) - }; + std::vector new_benchmarks + = {CREATE_BENCHMARK(float, 64, 64, Inclusive), + CREATE_BENCHMARK(float, 128, 64, Inclusive), + CREATE_BENCHMARK(float, 256, 64, Inclusive), + CREATE_BENCHMARK(float, 256, 32, Inclusive), + CREATE_BENCHMARK(float, 256, 16, Inclusive), + // force using shared memory version + CREATE_BENCHMARK(float, 63, 63, Inclusive), + CREATE_BENCHMARK(float, 62, 31, Inclusive), + CREATE_BENCHMARK(float, 60, 15, Inclusive), + + CREATE_BENCHMARK(int, 64, 64, Inclusive), + CREATE_BENCHMARK(int, 128, 64, Inclusive), + CREATE_BENCHMARK(int, 256, 64, Inclusive), + + CREATE_BENCHMARK(double, 64, 64, Inclusive), + CREATE_BENCHMARK(double, 128, 64, Inclusive), + CREATE_BENCHMARK(double, 256, 64, Inclusive), + + CREATE_BENCHMARK(custom_double2, 64, 64, Inclusive), + CREATE_BENCHMARK(custom_int_double, 64, 64, Inclusive)}; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -202,13 +196,13 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; diff --git a/benchmark/benchmark_warp_sort.cpp b/benchmark/benchmark_warp_sort.cpp index 379eaf73a..438770a2b 100644 --- a/benchmark/benchmark_warp_sort.cpp +++ b/benchmark/benchmark_warp_sort.cpp @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include #include -#include -#include -#include #include #include +#include +#include +#include +#include // Google Benchmark #include "benchmark/benchmark.h" @@ -39,14 +39,15 @@ #include "benchmark_utils.hpp" -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -54,15 +55,14 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; namespace rp = rocprim; -template -__global__ -void warp_sort_kernel(K* input_key) +template +__global__ void warp_sort_kernel(K* input_key) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto key = input_key[i]; + auto key = input_key[i]; rp::warp_sort wsort; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wsort.sort(key); @@ -70,58 +70,47 @@ void warp_sort_kernel(K* input_key) input_key[i] = key; } -template -__global__ -void warp_sort_by_key_kernel(K* input_key, V* input_value) +template +__global__ void warp_sort_by_key_kernel(K* input_key, V* input_value) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - auto key = input_key[i]; - auto value = input_value[i]; + auto key = input_key[i]; + auto value = input_value[i]; rp::warp_sort wsort; - #pragma nounroll +#pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wsort.sort(key, value); } - input_key[i] = key; + input_key[i] = key; input_value[i] = value; } -template< - class Key, - unsigned int BlockSize, - unsigned int WarpSize, - class Value = Key, - bool SortByKey = false, - unsigned int Trials = 100 -> +template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { // Make sure size is a multiple of BlockSize - size = BlockSize * ((size + BlockSize - 1)/BlockSize); + size = BlockSize * ((size + BlockSize - 1) / BlockSize); // Allocate and fill memory - std::vector input_key = get_random_data(size, Key(0), Key(10000)); + std::vector input_key = get_random_data(size, Key(0), Key(10000)); std::vector input_value(size_t(1)); - if(SortByKey) input_value = get_random_data(size, Value(0), Value(10000)); - Key * d_input_key = nullptr; - Value * d_input_value = nullptr; + if(SortByKey) + input_value = get_random_data(size, Value(0), Value(10000)); + Key* d_input_key = nullptr; + Value* d_input_value = nullptr; HIP_CHECK(hipMalloc(&d_input_key, size * sizeof(Key))); - if(SortByKey) HIP_CHECK(hipMalloc(&d_input_value, size * sizeof(Value))); - HIP_CHECK( - hipMemcpy( - d_input_key, input_key.data(), - size * sizeof(Key), - hipMemcpyHostToDevice - ) - ); - if(SortByKey) HIP_CHECK( - hipMemcpy( - d_input_value, input_value.data(), - size * sizeof(Value), - hipMemcpyHostToDevice - ) - ); + if(SortByKey) + HIP_CHECK(hipMalloc(&d_input_value, size * sizeof(Value))); + HIP_CHECK(hipMemcpy(d_input_key, input_key.data(), size * sizeof(Key), hipMemcpyHostToDevice)); + if(SortByKey) + HIP_CHECK(hipMemcpy( + d_input_value, input_value.data(), size * sizeof(Value), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) @@ -131,29 +120,34 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_sort_by_key_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - d_input_key, d_input_value - ); + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + d_input_key, + d_input_value); } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_sort_kernel), - dim3(size/BlockSize), dim3(BlockSize), 0, stream, - d_input_key - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_sort_kernel), + dim3(size / BlockSize), + dim3(BlockSize), + 0, + stream, + d_input_key); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); - auto elapsed_seconds = - std::chrono::duration_cast>(end - start); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } // SortByKey also transfers values auto sorted_type_size = sizeof(Key); - if(SortByKey) sorted_type_size += sizeof(Value); + if(SortByKey) + sorted_type_size += sizeof(Value); state.SetBytesProcessed(state.iterations() * size * sorted_type_size * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); @@ -161,21 +155,19 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) HIP_CHECK(hipFree(d_input_value)); } -#define CREATE_SORT_BENCHMARK(K, BS, WS) \ - benchmark::RegisterBenchmark( \ - "warp_sort<"#K", "#BS", "#WS">.sort(only keys)", \ - run_benchmark, \ - stream, size \ - ) - -#define CREATE_SORTBYKEY_BENCHMARK(K, V, BS, WS) \ - benchmark::RegisterBenchmark( \ - "warp_sort<"#K", "#BS", "#WS", "#V">.sort", \ - run_benchmark, \ - stream, size \ - ) - -int main(int argc, char *argv[]) +#define CREATE_SORT_BENCHMARK(K, BS, WS) \ + benchmark::RegisterBenchmark("warp_sort<" #K ", " #BS ", " #WS ">.sort(only keys)", \ + run_benchmark, \ + stream, \ + size) + +#define CREATE_SORTBYKEY_BENCHMARK(K, V, BS, WS) \ + benchmark::RegisterBenchmark("warp_sort<" #K ", " #BS ", " #WS ", " #V ">.sort", \ + run_benchmark, \ + stream, \ + size) + +int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); @@ -184,21 +176,20 @@ int main(int argc, char *argv[]) // Parse argv benchmark::Initialize(&argc, argv); - const size_t size = parser.get("size"); - const int trials = parser.get("trials"); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); // HIP - hipStream_t stream = 0; // default + hipStream_t stream = 0; // default hipDeviceProp_t devProp; - int device_id = 0; + int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; - using custom_double2 = custom_type; - using custom_int_double = custom_type; - std::vector benchmarks = - { + using custom_double2 = custom_type; + using custom_int_double = custom_type; + std::vector benchmarks = { // key type, block size, warp size CREATE_SORT_BENCHMARK(float, 64, 64), CREATE_SORT_BENCHMARK(float, 128, 64), @@ -209,7 +200,7 @@ int main(int argc, char *argv[]) CREATE_SORT_BENCHMARK(int, 64, 64), CREATE_SORT_BENCHMARK(double, 64, 64), CREATE_SORT_BENCHMARK(custom_double2, 64, 64), - CREATE_SORT_BENCHMARK(custom_int_double , 64, 64), + CREATE_SORT_BENCHMARK(custom_int_double, 64, 64), // key type, value type, block size, warp size CREATE_SORTBYKEY_BENCHMARK(float, float, 64, 64), diff --git a/benchmark/cmdparser.hpp b/benchmark/cmdparser.hpp index 5ffc24f62..823f3990b 100644 --- a/benchmark/cmdparser.hpp +++ b/benchmark/cmdparser.hpp @@ -26,186 +26,240 @@ */ #pragma once +#include #include +#include #include #include #include -#include -#include -namespace cli { - struct CallbackArgs { +namespace cli +{ + struct CallbackArgs + { const std::vector& arguments; - std::ostream& output; - std::ostream& error; + std::ostream& output; + std::ostream& error; }; - class Parser { + class Parser + { private: - class CmdBase { + class CmdBase + { public: - explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) : - name(name), - command(name.size() > 0 ? "-" + name : ""), - alternative(alternative.size() > 0 ? "--" + alternative : ""), - description(description), - required(required), - handled(false), - arguments({}), - dominant(dominant), - variadic(variadic) { + explicit CmdBase(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant, + bool variadic) + : name(name) + , command(name.size() > 0 ? "-" + name : "") + , alternative(alternative.size() > 0 ? "--" + alternative : "") + , description(description) + , required(required) + , handled(false) + , arguments({}) + , dominant(dominant) + , variadic(variadic) + { } - virtual ~CmdBase() { - } + virtual ~CmdBase() {} - std::string name; - std::string command; - std::string alternative; - std::string description; - bool required; - bool handled; + std::string name; + std::string command; + std::string alternative; + std::string description; + bool required; + bool handled; std::vector arguments; - bool const dominant; - bool const variadic; + bool const dominant; + bool const variadic; - virtual std::string print_value() const = 0; - virtual bool parse(std::ostream& output, std::ostream& error) = 0; + virtual std::string print_value() const = 0; + virtual bool parse(std::ostream& output, std::ostream& error) = 0; - bool is(const std::string& given) const { + bool is(const std::string& given) const + { return given == command || given == alternative; } }; - template + template struct ArgumentCountChecker { static constexpr bool Variadic = false; }; - template + template struct ArgumentCountChecker> { static constexpr bool Variadic = true; }; - template - class CmdFunction final : public CmdBase { + template + class CmdFunction final : public CmdBase + { public: - explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : - CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { + explicit CmdFunction(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant) + : CmdBase(name, + alternative, + description, + required, + dominant, + ArgumentCountChecker::Variadic) + { } - virtual bool parse(std::ostream& output, std::ostream& error) { - try { - CallbackArgs args { arguments, output, error }; + virtual bool parse(std::ostream& output, std::ostream& error) + { + try + { + CallbackArgs args {arguments, output, error}; value = callback(args); return true; - } catch (...) { + } + catch(...) + { return false; } } - virtual std::string print_value() const { + virtual std::string print_value() const + { return ""; } std::function callback; - T value; + T value; }; - template - class CmdArgument final : public CmdBase { + template + class CmdArgument final : public CmdBase + { public: - explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : - CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { + explicit CmdArgument(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant) + : CmdBase(name, + alternative, + description, + required, + dominant, + ArgumentCountChecker::Variadic) + { } - virtual bool parse(std::ostream&, std::ostream&) { - try { + virtual bool parse(std::ostream&, std::ostream&) + { + try + { value = Parser::parse(arguments, value); return true; - } catch (...) { + } + catch(...) + { return false; } } - virtual std::string print_value() const { + virtual std::string print_value() const + { return stringify(value); } T value; }; - static int parse(const std::vector& elements, const int&) { - if (elements.size() != 1) + static int parse(const std::vector& elements, const int&) + { + if(elements.size() != 1) throw std::bad_cast(); return std::stoi(elements[0]); } - static bool parse(const std::vector& elements, const bool& defval) { - if (elements.size() != 0) - throw std::runtime_error("A boolean command line parameter cannot have any arguments."); + static bool parse(const std::vector& elements, const bool& defval) + { + if(elements.size() != 0) + throw std::runtime_error( + "A boolean command line parameter cannot have any arguments."); return !defval; } - static double parse(const std::vector& elements, const double&) { - if (elements.size() != 1) + static double parse(const std::vector& elements, const double&) + { + if(elements.size() != 1) throw std::bad_cast(); return std::stod(elements[0]); } - static float parse(const std::vector& elements, const float&) { - if (elements.size() != 1) + static float parse(const std::vector& elements, const float&) + { + if(elements.size() != 1) throw std::bad_cast(); return std::stof(elements[0]); } - static long double parse(const std::vector& elements, const long double&) { - if (elements.size() != 1) + static long double parse(const std::vector& elements, const long double&) + { + if(elements.size() != 1) throw std::bad_cast(); return std::stold(elements[0]); } - static unsigned int parse(const std::vector& elements, const unsigned int&) { - if (elements.size() != 1) + static unsigned int parse(const std::vector& elements, const unsigned int&) + { + if(elements.size() != 1) throw std::bad_cast(); return static_cast(std::stoul(elements[0])); } - static unsigned long parse(const std::vector& elements, const unsigned long&) { - if (elements.size() != 1) + static unsigned long parse(const std::vector& elements, const unsigned long&) + { + if(elements.size() != 1) throw std::bad_cast(); return std::stoul(elements[0]); } - static long parse(const std::vector& elements, const long&) { - if (elements.size() != 1) + static long parse(const std::vector& elements, const long&) + { + if(elements.size() != 1) throw std::bad_cast(); return std::stol(elements[0]); } - static std::string parse(const std::vector& elements, const std::string&) { - if (elements.size() != 1) + static std::string parse(const std::vector& elements, const std::string&) + { + if(elements.size() != 1) throw std::bad_cast(); return elements[0]; } - template - static std::vector parse(const std::vector& elements, const std::vector&) { - const T defval = T(); - std::vector values { }; + template + static std::vector parse(const std::vector& elements, const std::vector&) + { + const T defval = T(); + std::vector values {}; std::vector buffer(1); - for (const auto& element : elements) { + for(const auto& element : elements) + { buffer[0] = element; values.push_back(parse(buffer, defval)); } @@ -213,17 +267,20 @@ namespace cli { return values; } - template - static std::string stringify(const T& value) { + template + static std::string stringify(const T& value) + { return std::to_string(value); } - template - static std::string stringify(const std::vector& values) { - std::stringstream ss { }; + template + static std::string stringify(const std::vector& values) + { + std::stringstream ss {}; ss << "[ "; - for (const auto& value : values) { + for(const auto& value : values) + { ss << stringify(value) << " "; } @@ -231,36 +288,46 @@ namespace cli { return ss.str(); } - static std::string stringify(const std::string& str) { + static std::string stringify(const std::string& str) + { return str; } public: - explicit Parser(int argc, const char** argv) : - _appname(argv[0]) { - for (int i = 1; i < argc; ++i) { + explicit Parser(int argc, const char** argv) + : _appname(argv[0]) + { + for(int i = 1; i < argc; ++i) + { _arguments.push_back(argv[i]); } enable_help(); } - explicit Parser(int argc, char** argv) : - _appname(argv[0]) { - for (int i = 1; i < argc; ++i) { + explicit Parser(int argc, char** argv) + : _appname(argv[0]) + { + for(int i = 1; i < argc; ++i) + { _arguments.push_back(argv[i]); } enable_help(); } - ~Parser() { - for (int i = 0, n = _commands.size(); i < n; ++i) { + ~Parser() + { + for(int i = 0, n = _commands.size(); i < n; ++i) + { delete _commands[i]; } } - bool has_help() const { - for (const auto command : _commands) { - if (command->name == "h" && command->alternative == "--help") { + bool has_help() const + { + for(const auto command : _commands) + { + if(command->name == "h" && command->alternative == "--help") + { return true; } } @@ -268,83 +335,118 @@ namespace cli { return false; } - void enable_help() { - set_callback("h", "help", std::function([this](CallbackArgs& args){ - args.output << this->usage(); - /*exit(0);*/ - return false; - }), "", true); - } - - void disable_help() { - for (auto command = _commands.begin(); command != _commands.end(); ++command) { - if ((*command)->name == "h" && (*command)->alternative == "--help") { + void enable_help() + { + set_callback("h", + "help", + std::function([this](CallbackArgs& args) { + args.output << this->usage(); + /*exit(0);*/ + return false; + }), + "", + true); + } + + void disable_help() + { + for(auto command = _commands.begin(); command != _commands.end(); ++command) + { + if((*command)->name == "h" && (*command)->alternative == "--help") + { _commands.erase(command); break; } } } - template - void set_default(bool is_required, const std::string& description = "") { - auto command = new CmdArgument { "", "", description, is_required, false }; + template + void set_default(bool is_required, const std::string& description = "") + { + auto command = new CmdArgument {"", "", description, is_required, false}; _commands.push_back(command); } - template - void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) { - auto command = new CmdArgument { name, alternative, description, true, dominant }; + template + void set_required(const std::string& name, + const std::string& alternative, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdArgument {name, alternative, description, true, dominant}; _commands.push_back(command); } - template - void set_optional(const std::string& name, const std::string& alternative, T defaultValue, const std::string& description = "", bool dominant = false) { - auto command = new CmdArgument { name, alternative, description, false, dominant }; + template + void set_optional(const std::string& name, + const std::string& alternative, + T defaultValue, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdArgument {name, alternative, description, false, dominant}; command->value = defaultValue; _commands.push_back(command); } - template - void set_callback(const std::string& name, const std::string& alternative, std::function callback, const std::string& description = "", bool dominant = false) { - auto command = new CmdFunction { name, alternative, description, false, dominant }; + template + void set_callback(const std::string& name, + const std::string& alternative, + std::function callback, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdFunction {name, alternative, description, false, dominant}; command->callback = callback; _commands.push_back(command); } - inline void run_and_exit_if_error() { - if (run() == false) { + inline void run_and_exit_if_error() + { + if(run() == false) + { exit(1); } } - inline bool run() { + inline bool run() + { return run(std::cout, std::cerr); } - inline bool run(std::ostream& output) { + inline bool run(std::ostream& output) + { return run(output, std::cerr); } - bool run(std::ostream& output, std::ostream& error) { - if (_arguments.size() > 0) { + bool run(std::ostream& output, std::ostream& error) + { + if(_arguments.size() > 0) + { auto current = find_default(); - for (int i = 0, n = _arguments.size(); i < n; ++i) { - auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; + for(int i = 0, n = _arguments.size(); i < n; ++i) + { + auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; auto associated = isarg ? find(_arguments[i]) : nullptr; - if (associated != nullptr) { - current = associated; + if(associated != nullptr) + { + current = associated; associated->handled = true; - } else if (current == nullptr) { + } + else if(current == nullptr) + { current = find(_arguments[i]); // Code was commented out so cmdparser can ignore unknown options // error << no_default(); // return false; - } else { + } + else + { current->arguments.push_back(_arguments[i]); current->handled = true; - if (!current->variadic) + if(!current->variadic) { // If the current command is not variadic, then no more arguments // should be added to it. In this case, switch back to the default @@ -357,24 +459,30 @@ namespace cli { // First, parse dominant arguments since they succeed even if required // arguments are missing. - for (auto command : _commands) { - if (command->handled && command->dominant && !command->parse(output, error)) { + for(auto command : _commands) + { + if(command->handled && command->dominant && !command->parse(output, error)) + { error << howto_use(command); return false; } } // Next, check for any missing arguments. - for (auto command : _commands) { - if (command->required && !command->handled) { + for(auto command : _commands) + { + if(command->required && !command->handled) + { error << howto_required(command); return false; } } // Finally, parse all remaining arguments. - for (auto command : _commands) { - if (command->handled && !command->dominant && !command->parse(output, error)) { + for(auto command : _commands) + { + if(command->handled && !command->dominant && !command->parse(output, error)) + { error << howto_use(command); return false; } @@ -383,14 +491,19 @@ namespace cli { return true; } - template - T get(const std::string& name) const { - for (const auto& command : _commands) { - if (command->name == name) { + template + T get(const std::string& name) const + { + for(const auto& command : _commands) + { + if(command->name == name) + { auto cmd = dynamic_cast*>(command); - if (cmd == nullptr) { - throw std::runtime_error("Invalid usage of the parameter " + name + " detected."); + if(cmd == nullptr) + { + throw std::runtime_error("Invalid usage of the parameter " + name + + " detected."); } return cmd->value; @@ -400,17 +513,21 @@ namespace cli { throw std::runtime_error("The parameter " + name + " could not be found."); } - template - T get_if(const std::string& name, std::function callback) const { + template + T get_if(const std::string& name, std::function callback) const + { auto value = get(name); return callback(value); } - int requirements() const { + int requirements() const + { int count = 0; - for (const auto& command : _commands) { - if (command->required) { + for(const auto& command : _commands) + { + if(command->required) + { ++count; } } @@ -418,18 +535,23 @@ namespace cli { return count; } - int commands() const { + int commands() const + { return static_cast(_commands.size()); } - inline const std::string& app_name() const { + inline const std::string& app_name() const + { return _appname; } protected: - CmdBase* find(const std::string& name) { - for (auto command : _commands) { - if (command->is(name)) { + CmdBase* find(const std::string& name) + { + for(auto command : _commands) + { + if(command->is(name)) + { return command; } } @@ -437,9 +559,12 @@ namespace cli { return nullptr; } - CmdBase* find_default() { - for (auto command : _commands) { - if (command->name == "") { + CmdBase* find_default() + { + for(auto command : _commands) + { + if(command->name == "") + { return command; } } @@ -447,21 +572,28 @@ namespace cli { return nullptr; } - std::string usage() const { - std::stringstream ss { }; + std::string usage() const + { + std::stringstream ss {}; ss << "Available parameters:\n\n"; - for (const auto& command : _commands) { + for(const auto& command : _commands) + { ss << " " << command->command << "\t" << command->alternative; - if (command->required == true) { + if(command->required == true) + { ss << "\t(required)"; } ss << "\n " << command->description; - if (command->required == false) { - ss << "\n " << "This parameter is optional. The default value is '" + command->print_value() << "'."; + if(command->required == false) + { + ss << "\n " + << "This parameter is optional. The default value is '" + + command->print_value() + << "'."; } ss << "\n\n"; @@ -470,30 +602,35 @@ namespace cli { return ss.str(); } - void print_help(std::stringstream& ss) const { - if (has_help()) { + void print_help(std::stringstream& ss) const + { + if(has_help()) + { ss << "For more help use --help or -h.\n"; } } - std::string howto_required(CmdBase* command) const { - std::stringstream ss { }; + std::string howto_required(CmdBase* command) const + { + std::stringstream ss {}; ss << "The parameter " << command->name << " is required.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } - std::string howto_use(CmdBase* command) const { - std::stringstream ss { }; + std::string howto_use(CmdBase* command) const + { + std::stringstream ss {}; ss << "The parameter " << command->name << " has invalid arguments.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } - std::string no_default() const { - std::stringstream ss { }; + std::string no_default() const + { + std::stringstream ss {}; ss << "No default parameter has been specified.\n"; ss << "The given argument must be used with a parameter.\n"; print_help(ss); @@ -501,8 +638,8 @@ namespace cli { } private: - const std::string _appname; + const std::string _appname; std::vector _arguments; - std::vector _commands; + std::vector _commands; }; } diff --git a/example/example_temporary_storage.cpp b/example/example_temporary_storage.cpp index e69223480..9d66e894f 100644 --- a/example/example_temporary_storage.cpp +++ b/example/example_temporary_storage.cpp @@ -18,8 +18,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#include #include +#include // rocPRIM API #include @@ -28,12 +28,8 @@ // Example with allocating shared memory required as a temporary storage // for a block-level parallel primitive inside a kernel -template< - const unsigned int BlockSize, - class T -> -__global__ -void example_shared_memory(const T *input, T *output) +template +__global__ void example_shared_memory(const T* input, T* output) { // Indexing for this block unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; @@ -48,25 +44,19 @@ void example_shared_memory(const T *input, T *output) // Execute inclusive plus scan input_value = input[index]; - block_scan_type() - .inclusive_scan( - input_value, - output_value, - storage, - rocprim::plus() - ); + block_scan_type().inclusive_scan(input_value, output_value, storage, rocprim::plus()); output[index] = output_value; } // Host function that runs example_shared_memory kernel -template +template void run_example_shared_memory(size_t size) { constexpr unsigned int block_size = 256; // Make sure size is a multiple of block_size unsigned int grid_size = (size + block_size - 1) / block_size; - size = block_size * grid_size; + size = block_size * grid_size; // Generate input on host and copy it to device std::vector host_input = get_random_data(size, 0, 1000); @@ -76,29 +66,30 @@ void run_example_shared_memory(size_t size) std::vector host_output(size); // Device memory allocation - T * device_input; - T * device_output; - HIP_CHECK(hipMalloc(&device_input, host_input.size() * sizeof(typename decltype(host_input)::value_type))); - HIP_CHECK(hipMalloc(&device_output, host_output.size() * sizeof(typename decltype(host_output)::value_type))); + T* device_input; + T* device_output; + HIP_CHECK(hipMalloc(&device_input, + host_input.size() * sizeof(typename decltype(host_input)::value_type))); + HIP_CHECK(hipMalloc(&device_output, + host_output.size() * sizeof(typename decltype(host_output)::value_type))); // Writing input data to device memory hip_write_device_memory(device_input, host_input); // Launching kernel example_shared_memory - hipLaunchKernelGGL( - HIP_KERNEL_NAME(example_shared_memory), - dim3(grid_size), dim3(block_size), - 0, 0, - device_input, device_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(example_shared_memory), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); // Reading output from device hip_read_device_memory(host_output, device_output); // Validating output - OUTPUT_VALIDATION_CHECK( - validate_device_output(host_output, host_expected_output) - ); + OUTPUT_VALIDATION_CHECK(validate_device_output(host_output, host_expected_output)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -107,87 +98,74 @@ void run_example_shared_memory(size_t size) } // Kernel 2 - storage_type for one primitive union'ed with storage_type of other primitive -template< - const unsigned int BlockSize, - const unsigned int ItemsPerThread, - class T -> -__global__ -void example_union_storage_types(const T *input, T *output) +template +__global__ void example_union_storage_types(const T* input, T* output) { // Specialize primitives - using block_scan_type = rocprim::block_scan< - T, BlockSize, rocprim::block_scan_algorithm::using_warp_scan - >; - using block_load_type = rocprim::block_load< - T, BlockSize, ItemsPerThread, rocprim::block_load_method::block_load_transpose - >; - using block_store_type = rocprim::block_store< - T, BlockSize, ItemsPerThread, rocprim::block_store_method::block_store_transpose - >; + using block_scan_type + = rocprim::block_scan; + using block_load_type = rocprim:: + block_load; + using block_store_type + = rocprim::block_store; // Allocate storage in shared memory for both scan and sort operations __shared__ union { - typename block_scan_type::storage_type scan; - typename block_load_type::storage_type load; + typename block_scan_type::storage_type scan; + typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; constexpr int items_per_block = BlockSize * ItemsPerThread; - int block_offset = (hipBlockIdx_x * items_per_block); + int block_offset = (hipBlockIdx_x * items_per_block); // Input/output array for block scan primitive T values[ItemsPerThread]; // Loading data for this thread - block_load_type().load( - input + block_offset, - values, - storage.load - ); + block_load_type().load(input + block_offset, values, storage.load); rocprim::syncthreads(); // Perform scan - block_scan_type() - .inclusive_scan( - values, // as input - values, // as output - storage.scan, - rocprim::plus() - ); + block_scan_type().inclusive_scan(values, // as input + values, // as output + storage.scan, + rocprim::plus()); rocprim::syncthreads(); // Save elements to output - block_store_type().store( - output + block_offset, - values, - storage.store - ); + block_store_type().store(output + block_offset, values, storage.store); } // Host function that runs example_union_storage_types kernel -template +template void run_example_union_storage_types(size_t size) { - constexpr unsigned int block_size = 256; + constexpr unsigned int block_size = 256; constexpr unsigned int items_per_thread = 4; // Make sure size is a multiple of block_size auto grid_size = (size + block_size - 1) / block_size; - size = block_size * grid_size; + size = block_size * grid_size; // Generate input on host and copy it to device std::vector host_input = get_random_data(size, 0, 1000); // Generating expected output for kernel - std::vector host_expected_output = get_expected_output(host_input, block_size, items_per_thread); + std::vector host_expected_output + = get_expected_output(host_input, block_size, items_per_thread); // For reading device output std::vector host_output(size); // Device memory allocation - T * device_input; - T * device_output; - HIP_CHECK(hipMalloc(&device_input, host_input.size() * sizeof(typename decltype(host_input)::value_type))); - HIP_CHECK(hipMalloc(&device_output, host_output.size() * sizeof(typename decltype(host_output)::value_type))); + T* device_input; + T* device_output; + HIP_CHECK(hipMalloc(&device_input, + host_input.size() * sizeof(typename decltype(host_input)::value_type))); + HIP_CHECK(hipMalloc(&device_output, + host_output.size() * sizeof(typename decltype(host_output)::value_type))); // Writing input data to device memory hip_write_device_memory(device_input, host_input); @@ -195,18 +173,18 @@ void run_example_union_storage_types(size_t size) // Launching kernel example_union_storage_types hipLaunchKernelGGL( HIP_KERNEL_NAME(example_union_storage_types), - dim3(grid_size), dim3(block_size), - 0, 0, - device_input, device_output - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); // Reading output from device hip_read_device_memory(host_output, device_output); // Validating output - OUTPUT_VALIDATION_CHECK( - validate_device_output(host_output, host_expected_output) - ); + OUTPUT_VALIDATION_CHECK(validate_device_output(host_output, host_expected_output)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -215,12 +193,8 @@ void run_example_union_storage_types(size_t size) } // Kernel 3 - Allocating shared memory in runtime -template< - const unsigned int BlockSize, - class T -> -__global__ -void example_dynamic_shared_memory(const T *input, T *output) +template +__global__ void example_dynamic_shared_memory(const T* input, T* output) { // Indexing for this block unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; @@ -237,24 +211,20 @@ void example_dynamic_shared_memory(const T *input, T *output) // execute inclusive scan input_value = input[index]; - block_scan_type() - .inclusive_scan( - input_value, output_value, - *primitive_storage, - rocprim::plus() - ); + block_scan_type().inclusive_scan( + input_value, output_value, *primitive_storage, rocprim::plus()); output[index] = output_value; } // Host function that runs example_dynamic_shared_memory kernel -template +template void run_example_dynamic_shared_memory(size_t size) { constexpr unsigned int block_size = 256; // Make sure size is a multiple of block_size auto grid_size = (size + block_size - 1) / block_size; - size = block_size * grid_size; + size = block_size * grid_size; // Generate input on host and copy it to device std::vector host_input = get_random_data(size, 0, 1000); @@ -264,29 +234,30 @@ void run_example_dynamic_shared_memory(size_t size) std::vector host_output(size); // Device memory allocation - T * device_input; - T * device_output; - HIP_CHECK(hipMalloc(&device_input, host_input.size() * sizeof(typename decltype(host_input)::value_type))); - HIP_CHECK(hipMalloc(&device_output, host_output.size() * sizeof(typename decltype(host_output)::value_type))); + T* device_input; + T* device_output; + HIP_CHECK(hipMalloc(&device_input, + host_input.size() * sizeof(typename decltype(host_input)::value_type))); + HIP_CHECK(hipMalloc(&device_output, + host_output.size() * sizeof(typename decltype(host_output)::value_type))); // Writing input data to device memory hip_write_device_memory(device_input, host_input); // Launching kernel example_shared_memory - hipLaunchKernelGGL( - HIP_KERNEL_NAME(example_dynamic_shared_memory), - dim3(grid_size), dim3(block_size), - sizeof(typename rocprim::block_scan::storage_type), 0, - device_input, device_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(example_dynamic_shared_memory), + dim3(grid_size), + dim3(block_size), + sizeof(typename rocprim::block_scan::storage_type), + 0, + device_input, + device_output); // Reading output from device hip_read_device_memory(host_output, device_output); // Validating output - OUTPUT_VALIDATION_CHECK( - validate_device_output(host_output, host_expected_output) - ); + OUTPUT_VALIDATION_CHECK(validate_device_output(host_output, host_expected_output)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -295,15 +266,11 @@ void run_example_dynamic_shared_memory(size_t size) } // Kernel 4 - Using global memory for storage -template< - const unsigned int BlockSize, - class T -> -__global__ -void example_global_memory_storage( - const T *input, - T *output, - typename rocprim::block_scan::storage_type *global_storage) +template +__global__ void example_global_memory_storage( + const T* input, + T* output, + typename rocprim::block_scan::storage_type* global_storage) { // Indexing for this block unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; @@ -315,24 +282,20 @@ void example_global_memory_storage( // execute inclusive scan input_value = input[index]; - block_scan_type() - .inclusive_scan( - input_value, output_value, - global_storage[hipBlockIdx_x], - rocprim::plus() - ); + block_scan_type().inclusive_scan( + input_value, output_value, global_storage[hipBlockIdx_x], rocprim::plus()); output[index] = output_value; } // Host function that runs example_global_memory_storage kernel -template +template void run_example_global_memory_storage(size_t size) { constexpr unsigned int block_size = 256; // Make sure size is a multiple of block_size auto grid_size = (size + block_size - 1) / block_size; - size = block_size * grid_size; + size = block_size * grid_size; // Generate input on host and copy it to device std::vector host_input = get_random_data(size, 0, 1000); @@ -342,34 +305,36 @@ void run_example_global_memory_storage(size_t size) std::vector host_output(size); // Device memory allocation - T * device_input; - T * device_output; - HIP_CHECK(hipMalloc(&device_input, host_input.size() * sizeof(typename decltype(host_input)::value_type))); - HIP_CHECK(hipMalloc(&device_output, host_output.size() * sizeof(typename decltype(host_output)::value_type))); + T* device_input; + T* device_output; + HIP_CHECK(hipMalloc(&device_input, + host_input.size() * sizeof(typename decltype(host_input)::value_type))); + HIP_CHECK(hipMalloc(&device_output, + host_output.size() * sizeof(typename decltype(host_output)::value_type))); // Writing input data to device memory hip_write_device_memory(device_input, host_input); // Allocating temporary storage in global memory using storage_type = typename rocprim::block_scan::storage_type; - storage_type *global_storage; + storage_type* global_storage; HIP_CHECK(hipMalloc(&global_storage, (grid_size * sizeof(storage_type)))); // Launching kernel example_shared_memory - hipLaunchKernelGGL( - HIP_KERNEL_NAME(example_global_memory_storage), - dim3(grid_size), dim3(block_size), - 0, 0, - device_input, device_output, global_storage - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(example_global_memory_storage), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + global_storage); // Reading output from device hip_read_device_memory(host_output, device_output); // Validating output - OUTPUT_VALIDATION_CHECK( - validate_device_output(host_output, host_expected_output) - ); + OUTPUT_VALIDATION_CHECK(validate_device_output(host_output, host_expected_output)); HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); @@ -385,10 +350,10 @@ int main() HIP_CHECK(hipGetDeviceProperties(&device_properties, 0)); // Show device info - printf("Selected device: %s \n", device_properties.name ); - printf("Available global memory: %lu \n", device_properties.totalGlobalMem ); - printf("Shared memory per block: %lu \n", device_properties.sharedMemPerBlock ); - printf("Warp size: %d \n", device_properties.warpSize ); + printf("Selected device: %s \n", device_properties.name); + printf("Available global memory: %lu \n", device_properties.totalGlobalMem); + printf("Shared memory per block: %lu \n", device_properties.sharedMemPerBlock); + printf("Warp size: %d \n", device_properties.warpSize); printf("Max threads per block: %d \n", device_properties.maxThreadsPerBlock); // Running kernels diff --git a/example/example_utils.hpp b/example/example_utils.hpp index d0d280ca1..3d899526b 100644 --- a/example/example_utils.hpp +++ b/example/example_utils.hpp @@ -22,68 +22,64 @@ #define ROCPRIM_EXAMPLE_UTILS_HPP_ #include -#include #include #include +#include #include -#define HIP_CHECK(condition) \ -{ \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ -} +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } -#define OUTPUT_VALIDATION_CHECK(validation_result) \ - { \ - if ( validation_result == false ) \ - { \ - std::cout << "Output validation failed!" << std::endl; \ - return; \ - } \ - } +#define OUTPUT_VALIDATION_CHECK(validation_result) \ + { \ + if(validation_result == false) \ + { \ + std::cout << "Output validation failed!" << std::endl; \ + return; \ + } \ + } -template -inline auto get_random_data(size_t size, T min, T max) - -> typename std::enable_if::value, std::vector>::type +template +inline auto get_random_data(size_t size, T min, T max) -> + typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); + std::random_device rd; + std::default_random_engine gen(rd()); std::uniform_int_distribution distribution(min, max); - std::vector data(size); + std::vector data(size); std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); return data; } -template -inline void hip_read_device_memory(std::vector &host_destination, T *device_source) +template +inline void hip_read_device_memory(std::vector& host_destination, T* device_source) { - HIP_CHECK( - hipMemcpy( - host_destination.data(), device_source, - host_destination.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(host_destination.data(), + device_source, + host_destination.size() * sizeof(T), + hipMemcpyDeviceToHost)); } -template -inline void hip_write_device_memory(T *device_destination, std::vector& host_source) +template +inline void hip_write_device_memory(T* device_destination, std::vector& host_source) { - HIP_CHECK( - hipMemcpy( - device_destination, host_source.data(), - host_source.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_destination, + host_source.data(), + host_source.size() * sizeof(T), + hipMemcpyHostToDevice)); } -template -inline bool validate_device_output(const std::vector &host_output, const std::vector &expected_output) +template +inline bool validate_device_output(const std::vector& host_output, + const std::vector& expected_output) { for(unsigned int index = 0; index < host_output.size(); index++) { @@ -96,20 +92,20 @@ inline bool validate_device_output(const std::vector &host_output, const std: } // Generating expected output for block scan when using rocprim::plus as function -template -std::vector get_expected_output( - const std::vector &host_input, - const unsigned int block_size, - const unsigned int items_per_thread = 1) +template +std::vector get_expected_output(const std::vector& host_input, + const unsigned int block_size, + const unsigned int items_per_thread = 1) { - unsigned int grid_size = host_input.size() / block_size; + unsigned int grid_size = host_input.size() / block_size; std::vector host_expected_output(host_input.size()); for(unsigned int block_index = 0; block_index < (grid_size / items_per_thread); block_index++) { host_expected_output[block_index * block_size] = host_input[block_index * block_size]; - for(unsigned int thread_index = 1; thread_index < (block_size * items_per_thread); thread_index++) + for(unsigned int thread_index = 1; thread_index < (block_size * items_per_thread); + thread_index++) { - int index = block_index * block_size + thread_index; + int index = block_index * block_size + thread_index; host_expected_output[index] = host_expected_output[index - 1] + host_input[index]; } } diff --git a/rocprim/include/rocprim/block/block_discontinuity.hpp b/rocprim/include/rocprim/block/block_discontinuity.hpp index d3af712f8..3f8871140 100644 --- a/rocprim/include/rocprim/block/block_discontinuity.hpp +++ b/rocprim/include/rocprim/block/block_discontinuity.hpp @@ -27,10 +27,10 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" -#include "../types.hpp" +#include "../intrinsics.hpp" #include "../type_traits.hpp" +#include "../types.hpp" /// \addtogroup blockmodule /// @{ @@ -40,40 +40,39 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Trait checks if FlagOp can be called with 3 arguments (a, b, b_index) -template -struct with_b_index_arg - : std::false_type -{ }; - -template -struct with_b_index_arg< - T, FlagOp, - typename std::conditional< - true, - void, - decltype(std::declval()(std::declval(), std::declval(), 0)) - >::type - > : std::true_type -{ }; - -// Wrapping function that allows to call FlagOp of any of these signatures: -// with b_index (a, b, b_index) or without it (a, b). -template -ROCPRIM_DEVICE inline -typename std::enable_if::value, bool>::type -apply(FlagOp flag_op, const T& a, const T& b, unsigned int b_index) -{ - return flag_op(a, b, b_index); -} + // Trait checks if FlagOp can be called with 3 arguments (a, b, b_index) + template + struct with_b_index_arg : std::false_type + { + }; -template -ROCPRIM_DEVICE inline -typename std::enable_if::value, bool>::type -apply(FlagOp flag_op, const T& a, const T& b, unsigned int) -{ - return flag_op(a, b); -} + template + struct with_b_index_arg< + T, + FlagOp, + typename std::conditional()( + std::declval(), std::declval(), 0))>::type> + : std::true_type + { + }; + + // Wrapping function that allows to call FlagOp of any of these signatures: + // with b_index (a, b, b_index) or without it (a, b). + template + ROCPRIM_DEVICE inline typename std::enable_if::value, bool>::type + apply(FlagOp flag_op, const T& a, const T& b, unsigned int b_index) + { + return flag_op(a, b, b_index); + } + + template + ROCPRIM_DEVICE inline typename std::enable_if::value, bool>::type + apply(FlagOp flag_op, const T& a, const T& b, unsigned int) + { + return flag_op(a, b); + } } // end namespace detail @@ -116,10 +115,7 @@ apply(FlagOp flag_op, const T& a, const T& b, unsigned int) /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSize -> +template class block_discontinuity { // Struct used for creating a raw_storage object for this primitive's temporary storage. @@ -130,20 +126,19 @@ class block_discontinuity }; public: - - /// \brief Struct used to allocate a temporary memory that is required for thread - /// communication during operations provided by related parallel primitive. - /// - /// Depending on the implemention the operations exposed by parallel primitive may - /// require a temporary storage for thread communication. The storage should be allocated - /// using keywords __shared__. It can be aliased to - /// an externally allocated memory, or be a part of a union type with other storage types - /// to increase shared memory reusability. - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen +/// \brief Struct used to allocate a temporary memory that is required for thread +/// communication during operations provided by related parallel primitive. +/// +/// Depending on the implemention the operations exposed by parallel primitive may +/// require a temporary storage for thread communication. The storage should be allocated +/// using keywords __shared__. It can be aliased to +/// an externally allocated memory, or be a part of a union type with other storage types +/// to increase shared memory reusability. +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = detail::raw_storage; - #else +#else using storage_type = storage_type_; - #endif +#endif /// \brief Tags \p head_flags that indicate discontinuities between items partitioned /// across the thread block, where the first item has no reference and is always @@ -186,17 +181,19 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_heads(Flag (&head_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_heads(Flag (&head_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { - flag_impl( - head_flags, /* ignored: */ input[0], /* ignored: */ head_flags, /* ignored: */ input[0], - input, flag_op, storage - ); + flag_impl(head_flags, + /* ignored: */ input[0], + /* ignored: */ head_flags, + /* ignored: */ input[0], + input, + flag_op, + storage); } /// \brief Tags \p head_flags that indicate discontinuities between items partitioned @@ -215,11 +212,10 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_heads(Flag (&head_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_heads(Flag (&head_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; flag_heads(head_flags, input, flag_op, storage); @@ -274,18 +270,20 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_heads(Flag (&head_flags)[ItemsPerThread], - T tile_predecessor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_heads(Flag (&head_flags)[ItemsPerThread], + T tile_predecessor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { - flag_impl( - head_flags, tile_predecessor_item, /* ignored: */ head_flags, /* ignored: */ input[0], - input, flag_op, storage - ); + flag_impl(head_flags, + tile_predecessor_item, + /* ignored: */ head_flags, + /* ignored: */ input[0], + input, + flag_op, + storage); } /// \brief Tags \p head_flags that indicate discontinuities between items partitioned @@ -306,12 +304,11 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_heads(Flag (&head_flags)[ItemsPerThread], - T tile_predecessor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_heads(Flag (&head_flags)[ItemsPerThread], + T tile_predecessor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; flag_heads(head_flags, tile_predecessor_item, input, flag_op, storage); @@ -358,17 +355,20 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_tails(Flag (&tail_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_tails(Flag (&tail_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { flag_impl( - /* ignored: */ tail_flags, /* ignored: */ input[0], tail_flags, /* ignored: */ input[0], - input, flag_op, storage - ); + /* ignored: */ tail_flags, + /* ignored: */ input[0], + tail_flags, + /* ignored: */ input[0], + input, + flag_op, + storage); } /// \brief Tags \p tail_flags that indicate discontinuities between items partitioned @@ -387,11 +387,10 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_tails(Flag (&tail_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_tails(Flag (&tail_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; flag_tails(tail_flags, input, flag_op, storage); @@ -446,18 +445,21 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_tails(Flag (&tail_flags)[ItemsPerThread], - T tile_successor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_tails(Flag (&tail_flags)[ItemsPerThread], + T tile_successor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { flag_impl( - /* ignored: */ tail_flags, /* ignored: */ input[0], tail_flags, tile_successor_item, - input, flag_op, storage - ); + /* ignored: */ tail_flags, + /* ignored: */ input[0], + tail_flags, + tile_successor_item, + input, + flag_op, + storage); } /// \brief Tags \p tail_flags that indicate discontinuities between items partitioned @@ -478,12 +480,11 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_tails(Flag (&tail_flags)[ItemsPerThread], - T tile_successor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_tails(Flag (&tail_flags)[ItemsPerThread], + T tile_successor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; flag_tails(tail_flags, tile_successor_item, input, flag_op, storage); @@ -532,18 +533,20 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - Flag (&tail_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + Flag (&tail_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { - flag_impl( - head_flags, /* ignored: */ input[0], tail_flags, /* ignored: */ input[0], - input, flag_op, storage - ); + flag_impl(head_flags, + /* ignored: */ input[0], + tail_flags, + /* ignored: */ input[0], + input, + flag_op, + storage); } /// \brief Tags both \p head_flags and\p tail_flags that indicate discontinuities @@ -562,12 +565,11 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - Flag (&tail_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + Flag (&tail_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; flag_heads_and_tails(head_flags, tail_flags, input, flag_op, storage); @@ -625,19 +627,21 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - Flag (&tail_flags)[ItemsPerThread], - T tile_successor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + Flag (&tail_flags)[ItemsPerThread], + T tile_successor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { - flag_impl( - head_flags, /* ignored: */ input[0], tail_flags, tile_successor_item, - input, flag_op, storage - ); + flag_impl(head_flags, + /* ignored: */ input[0], + tail_flags, + tile_successor_item, + input, + flag_op, + storage); } /// \brief Tags both \p head_flags and\p tail_flags that indicate discontinuities @@ -659,13 +663,12 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - Flag (&tail_flags)[ItemsPerThread], - T tile_successor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + Flag (&tail_flags)[ItemsPerThread], + T tile_successor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, flag_op, storage); @@ -723,19 +726,21 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - T tile_predecessor_item, - Flag (&tail_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + T tile_predecessor_item, + Flag (&tail_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { - flag_impl( - head_flags, tile_predecessor_item, tail_flags, /* ignored: */ input[0], - input, flag_op, storage - ); + flag_impl(head_flags, + tile_predecessor_item, + tail_flags, + /* ignored: */ input[0], + input, + flag_op, + storage); } /// \brief Tags both \p head_flags and\p tail_flags that indicate discontinuities @@ -757,16 +762,16 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - T tile_predecessor_item, - Flag (&tail_flags)[ItemsPerThread], - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + T tile_predecessor_item, + Flag (&tail_flags)[ItemsPerThread], + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; - flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage); + flag_heads_and_tails( + head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage); } /// \brief Tags both \p head_flags and\p tail_flags that indicate discontinuities @@ -827,20 +832,22 @@ class block_discontinuity /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - T tile_predecessor_item, - Flag (&tail_flags)[ItemsPerThread], - T tile_successor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + T tile_predecessor_item, + Flag (&tail_flags)[ItemsPerThread], + T tile_successor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { - flag_impl( - head_flags, tile_predecessor_item, tail_flags, tile_successor_item, - input, flag_op, storage - ); + flag_impl(head_flags, + tile_predecessor_item, + tail_flags, + tile_successor_item, + input, + flag_op, + storage); } /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities @@ -865,46 +872,44 @@ class block_discontinuity /// bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index);. /// The signature does not need to have const &, but function object /// must not modify the objects passed to it. - template - ROCPRIM_DEVICE inline - void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], - T tile_predecessor_item, - Flag (&tail_flags)[ItemsPerThread], - T tile_successor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op) + template + ROCPRIM_DEVICE inline void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread], + T tile_predecessor_item, + Flag (&tail_flags)[ItemsPerThread], + T tile_successor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op) { ROCPRIM_SHARED_MEMORY storage_type storage; - flag_heads_and_tails( - head_flags, tile_predecessor_item, tail_flags, tile_successor_item, - input, flag_op, storage - ); + flag_heads_and_tails(head_flags, + tile_predecessor_item, + tail_flags, + tile_successor_item, + input, + flag_op, + storage); } private: - - template< - bool WithHeads, - bool WithTilePredecessor, - bool WithTails, - bool WithTileSuccessor, - unsigned int ItemsPerThread, - class Flag, - class FlagOp - > - ROCPRIM_DEVICE inline - void flag_impl(Flag (&head_flags)[ItemsPerThread], - T tile_predecessor_item, - Flag (&tail_flags)[ItemsPerThread], - T tile_successor_item, - const T (&input)[ItemsPerThread], - FlagOp flag_op, - storage_type& storage) + template + ROCPRIM_DEVICE inline void flag_impl(Flag (&head_flags)[ItemsPerThread], + T tile_predecessor_item, + Flag (&tail_flags)[ItemsPerThread], + T tile_successor_item, + const T (&input)[ItemsPerThread], + FlagOp flag_op, + storage_type& storage) { static_assert(::rocprim::is_integral::value, "Flag must be integral type"); - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); // Copy input items for rare cases when input and head_flags/tail_flags are the same arrays // (in other cases it does not affect performance) T items[ItemsPerThread]; @@ -932,7 +937,8 @@ class block_discontinuity { predecessor_item = storage_.last_items[flat_id - 1]; } - head_flags[0] = detail::apply(flag_op, predecessor_item, items[0], flat_id * ItemsPerThread); + head_flags[0] + = detail::apply(flag_op, predecessor_item, items[0], flat_id * ItemsPerThread); } else { @@ -940,22 +946,25 @@ class block_discontinuity head_flags[0] = true; if(flat_id != 0) { - head_flags[0] = detail::apply( - flag_op, storage_.last_items[flat_id - 1], items[0], flat_id * ItemsPerThread - ); + head_flags[0] = detail::apply(flag_op, + storage_.last_items[flat_id - 1], + items[0], + flat_id * ItemsPerThread); } } for(unsigned int i = 1; i < ItemsPerThread; i++) { - head_flags[i] = detail::apply(flag_op, items[i - 1], items[i], flat_id * ItemsPerThread + i); + head_flags[i] + = detail::apply(flag_op, items[i - 1], items[i], flat_id * ItemsPerThread + i); } } if(WithTails) { for(unsigned int i = 0; i < ItemsPerThread - 1; i++) { - tail_flags[i] = detail::apply(flag_op, items[i], items[i + 1], flat_id * ItemsPerThread + i + 1); + tail_flags[i] = detail::apply( + flag_op, items[i], items[i + 1], flat_id * ItemsPerThread + i + 1); } if(WithTileSuccessor) @@ -965,10 +974,11 @@ class block_discontinuity { successor_item = storage_.first_items[flat_id + 1]; } - tail_flags[ItemsPerThread - 1] = detail::apply( - flag_op, items[ItemsPerThread - 1], successor_item, - flat_id * ItemsPerThread + ItemsPerThread - ); + tail_flags[ItemsPerThread - 1] + = detail::apply(flag_op, + items[ItemsPerThread - 1], + successor_item, + flat_id * ItemsPerThread + ItemsPerThread); } else { @@ -976,10 +986,11 @@ class block_discontinuity tail_flags[ItemsPerThread - 1] = true; if(flat_id != BlockSize - 1) { - tail_flags[ItemsPerThread - 1] = detail::apply( - flag_op, items[ItemsPerThread - 1], storage_.first_items[flat_id + 1], - flat_id * ItemsPerThread + ItemsPerThread - ); + tail_flags[ItemsPerThread - 1] + = detail::apply(flag_op, + items[ItemsPerThread - 1], + storage_.first_items[flat_id + 1], + flat_id * ItemsPerThread + ItemsPerThread); } } } diff --git a/rocprim/include/rocprim/block/block_exchange.hpp b/rocprim/include/rocprim/block/block_exchange.hpp index fe9fa06c7..e2a13098a 100644 --- a/rocprim/include/rocprim/block/block_exchange.hpp +++ b/rocprim/include/rocprim/block/block_exchange.hpp @@ -24,8 +24,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" /// \addtogroup blockmodule @@ -71,27 +71,23 @@ BEGIN_ROCPRIM_NAMESPACE /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template class block_exchange { // Select warp size - static constexpr unsigned int warp_size = - detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); + static constexpr unsigned int warp_size + = detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); // Number of warps in block static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size; // Minimize LDS bank conflicts for power-of-two strides, i.e. when items accessed // using `thread_id * ItemsPerThread` pattern where ItemsPerThread is power of two // (all exchanges from/to blocked). - static constexpr bool has_bank_conflicts = - ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread); + static constexpr bool has_bank_conflicts + = ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread); static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no(); - static constexpr unsigned int bank_conflicts_padding = - has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0; + static constexpr unsigned int bank_conflicts_padding + = has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0; // Struct used for creating a raw_storage object for this primitive's temporary storage. struct storage_type_ @@ -100,20 +96,19 @@ class block_exchange }; public: - - /// \brief Struct used to allocate a temporary memory that is required for thread - /// communication during operations provided by related parallel primitive. - /// - /// Depending on the implemention the operations exposed by parallel primitive may - /// require a temporary storage for thread communication. The storage should be allocated - /// using keywords __shared__. It can be aliased to - /// an externally allocated memory, or be a part of a union type with other storage types - /// to increase shared memory reusability. - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen +/// \brief Struct used to allocate a temporary memory that is required for thread +/// communication during operations provided by related parallel primitive. +/// +/// Depending on the implemention the operations exposed by parallel primitive may +/// require a temporary storage for thread communication. The storage should be allocated +/// using keywords __shared__. It can be aliased to +/// an externally allocated memory, or be a part of a union type with other storage types +/// to increase shared memory reusability. +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = detail::raw_storage; - #else +#else using storage_type = storage_type_; // only for Doxygen - #endif +#endif /// \brief Transposes a blocked arrangement of items to a striped arrangement /// across the thread block. @@ -122,10 +117,9 @@ class block_exchange /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. - template - ROCPRIM_DEVICE inline - void blocked_to_striped(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void blocked_to_striped(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; blocked_to_striped(input, output, storage); @@ -160,14 +154,13 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void blocked_to_striped(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void blocked_to_striped(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + storage_type& storage) { - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { @@ -188,10 +181,9 @@ class block_exchange /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. - template - ROCPRIM_DEVICE inline - void striped_to_blocked(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void striped_to_blocked(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; striped_to_blocked(input, output, storage); @@ -226,14 +218,13 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void striped_to_blocked(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void striped_to_blocked(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + storage_type& storage) { - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { @@ -254,10 +245,9 @@ class block_exchange /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. - template - ROCPRIM_DEVICE inline - void blocked_to_warp_striped(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void blocked_to_warp_striped(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; blocked_to_warp_striped(input, output, storage); @@ -292,18 +282,17 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void blocked_to_warp_striped(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void blocked_to_warp_striped(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + storage_type& storage) { - constexpr unsigned int items_per_warp = warp_size * ItemsPerThread; - const unsigned int lane_id = ::rocprim::lane_id(); - const unsigned int warp_id = ::rocprim::warp_id(); - const unsigned int current_warp_size = get_current_warp_size(); - const unsigned int offset = warp_id * items_per_warp; - storage_type_& storage_ = storage.get(); + constexpr unsigned int items_per_warp = warp_size * ItemsPerThread; + const unsigned int lane_id = ::rocprim::lane_id(); + const unsigned int warp_id = ::rocprim::warp_id(); + const unsigned int current_warp_size = get_current_warp_size(); + const unsigned int offset = warp_id * items_per_warp; + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { @@ -323,10 +312,9 @@ class block_exchange /// /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. - template - ROCPRIM_DEVICE inline - void warp_striped_to_blocked(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void warp_striped_to_blocked(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; warp_striped_to_blocked(input, output, storage); @@ -361,18 +349,17 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void warp_striped_to_blocked(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void warp_striped_to_blocked(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + storage_type& storage) { - constexpr unsigned int items_per_warp = warp_size * ItemsPerThread; - const unsigned int lane_id = ::rocprim::lane_id(); - const unsigned int warp_id = ::rocprim::warp_id(); - const unsigned int current_warp_size = get_current_warp_size(); - const unsigned int offset = warp_id * items_per_warp; - storage_type_& storage_ = storage.get(); + constexpr unsigned int items_per_warp = warp_size * ItemsPerThread; + const unsigned int lane_id = ::rocprim::lane_id(); + const unsigned int warp_id = ::rocprim::warp_id(); + const unsigned int current_warp_size = get_current_warp_size(); + const unsigned int offset = warp_id * items_per_warp; + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { @@ -394,11 +381,10 @@ class block_exchange /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// \param [out] ranks - array that has rank of data. - template - ROCPRIM_DEVICE inline - void scatter_to_blocked(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void scatter_to_blocked(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; scatter_to_blocked(input, output, ranks, storage); @@ -436,19 +422,18 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void scatter_to_blocked(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void scatter_to_blocked(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread], + storage_type& storage) { - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { - const Offset rank = ranks[i]; + const Offset rank = ranks[i]; storage_.buffer[index(rank)] = input[i]; } ::rocprim::syncthreads(); @@ -468,11 +453,10 @@ class block_exchange /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// \param [out] ranks - array that has rank of data. - template - ROCPRIM_DEVICE inline - void scatter_to_striped(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void scatter_to_striped(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; scatter_to_striped(input, output, ranks, storage); @@ -510,19 +494,18 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void scatter_to_striped(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void scatter_to_striped(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread], + storage_type& storage) { - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { - const Offset rank = ranks[i]; + const Offset rank = ranks[i]; storage_.buffer[rank] = input[i]; } ::rocprim::syncthreads(); @@ -545,11 +528,10 @@ class block_exchange /// \param [in] input - array that data is loaded from. /// \param [out] output - array that data is loaded to. /// \param [in] ranks - array that has rank of data. - template - ROCPRIM_DEVICE inline - void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; scatter_to_striped_guarded(input, output, ranks, storage); @@ -590,15 +572,14 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void scatter_to_striped_guarded(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread], + storage_type& storage) { - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { @@ -627,12 +608,12 @@ class block_exchange /// \param [out] output - array that data is loaded to. /// \param [in] ranks - array that has rank of data. /// \param [in] is_valid - array that has flags to denote validity. - template - ROCPRIM_DEVICE inline - void scatter_to_striped_flagged(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread], - const ValidFlag (&is_valid)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void + scatter_to_striped_flagged(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread], + const ValidFlag (&is_valid)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; scatter_to_striped_flagged(input, output, ranks, is_valid, storage); @@ -674,16 +655,16 @@ class block_exchange /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void scatter_to_striped_flagged(const T (&input)[ItemsPerThread], - U (&output)[ItemsPerThread], - const Offset (&ranks)[ItemsPerThread], - const ValidFlag (&is_valid)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + scatter_to_striped_flagged(const T (&input)[ItemsPerThread], + U (&output)[ItemsPerThread], + const Offset (&ranks)[ItemsPerThread], + const ValidFlag (&is_valid)[ItemsPerThread], + storage_type& storage) { - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); for(unsigned int i = 0; i < ItemsPerThread; i++) { @@ -702,19 +683,16 @@ class block_exchange } private: - - ROCPRIM_DEVICE inline - unsigned int get_current_warp_size() const + ROCPRIM_DEVICE inline unsigned int get_current_warp_size() const { const unsigned int warp_id = ::rocprim::warp_id(); return (warp_id == warps_no - 1) - ? (BlockSize % warp_size > 0 ? BlockSize % warp_size : warp_size) - : warp_size; + ? (BlockSize % warp_size > 0 ? BlockSize % warp_size : warp_size) + : warp_size; } // Change index to minimize LDS bank conflicts if necessary - ROCPRIM_DEVICE inline - unsigned int index(unsigned int n) + ROCPRIM_DEVICE inline unsigned int index(unsigned int n) { // Move every 32-bank wide "row" (32 banks * 4 bytes) by one item return has_bank_conflicts ? (n + n / banks_no) : n; diff --git a/rocprim/include/rocprim/block/block_histogram.hpp b/rocprim/include/rocprim/block/block_histogram.hpp index 9469a3f5f..c0f5b5380 100644 --- a/rocprim/include/rocprim/block/block_histogram.hpp +++ b/rocprim/include/rocprim/block/block_histogram.hpp @@ -26,8 +26,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "detail/block_histogram_atomic.hpp" #include "detail/block_histogram_sort.hpp" @@ -62,24 +62,24 @@ enum class block_histogram_algorithm namespace detail { -// Selector for block_histogram algorithm which gives block histogram implementation -// type based on passed block_histogram_algorithm enum -template -struct select_block_histogram_impl; + // Selector for block_histogram algorithm which gives block histogram implementation + // type based on passed block_histogram_algorithm enum + template + struct select_block_histogram_impl; -template<> -struct select_block_histogram_impl -{ - template - using type = block_histogram_atomic; -}; + template <> + struct select_block_histogram_impl + { + template + using type = block_histogram_atomic; + }; -template<> -struct select_block_histogram_impl -{ - template - using type = block_histogram_sort; -}; + template <> + struct select_block_histogram_impl + { + template + using type = block_histogram_sort; + }; } // end namespace detail @@ -123,19 +123,20 @@ struct select_block_histogram_impl /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Bins, - block_histogram_algorithm Algorithm = block_histogram_algorithm::default_algorithm -> +template class block_histogram #ifndef DOXYGEN_SHOULD_SKIP_THIS - : private detail::select_block_histogram_impl::template type + : private detail::select_block_histogram_impl< + Algorithm>::template type #endif { - using base_type = typename detail::select_block_histogram_impl::template type; + using base_type = typename detail::select_block_histogram_impl< + Algorithm>::template type; + public: /// \brief Struct used to allocate a temporary memory that is required for thread /// communication during operations provided by related parallel primitive. @@ -152,13 +153,12 @@ class block_histogram /// \tparam Counter - [inferred] counter type of histogram. /// /// \param [out] hist - histogram bin count. - template - ROCPRIM_DEVICE inline - void init_histogram(Counter hist[Bins]) + template + ROCPRIM_DEVICE inline void init_histogram(Counter hist[Bins]) { const auto flat_tid = ::rocprim::flat_block_thread_id(); - #pragma unroll +#pragma unroll for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { const unsigned int offset_tid = offset + flat_tid; @@ -216,11 +216,9 @@ class block_histogram /// } /// \endcode /// \endparblock - template - ROCPRIM_DEVICE inline - void composite(T (&input)[ItemsPerThread], - Counter hist[Bins], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + composite(T (&input)[ItemsPerThread], Counter hist[Bins], storage_type& storage) { base_type::composite(input, hist, storage); } @@ -236,10 +234,8 @@ class block_histogram /// /// \param [in] input - reference to an array containing thread input values. /// \param [out] hist - histogram bin count. - template - ROCPRIM_DEVICE inline - void composite(T (&input)[ItemsPerThread], - Counter hist[Bins]) + template + ROCPRIM_DEVICE inline void composite(T (&input)[ItemsPerThread], Counter hist[Bins]) { base_type::composite(input, hist); } @@ -284,11 +280,9 @@ class block_histogram /// } /// \endcode /// \endparblock - template - ROCPRIM_DEVICE inline - void histogram(T (&input)[ItemsPerThread], - Counter hist[Bins], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + histogram(T (&input)[ItemsPerThread], Counter hist[Bins], storage_type& storage) { init_histogram(hist); ::rocprim::syncthreads(); @@ -306,10 +300,8 @@ class block_histogram /// /// \param [in] input - reference to an array containing thread input values. /// \param [out] hist - histogram bin count. - template - ROCPRIM_DEVICE inline - void histogram(T (&input)[ItemsPerThread], - Counter hist[Bins]) + template + ROCPRIM_DEVICE inline void histogram(T (&input)[ItemsPerThread], Counter hist[Bins]) { init_histogram(hist); ::rocprim::syncthreads(); diff --git a/rocprim/include/rocprim/block/block_load.hpp b/rocprim/include/rocprim/block/block_load.hpp index cc560f804..b26583f6b 100644 --- a/rocprim/include/rocprim/block/block_load.hpp +++ b/rocprim/include/rocprim/block/block_load.hpp @@ -24,12 +24,12 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" -#include "block_load_func.hpp" #include "block_exchange.hpp" +#include "block_load_func.hpp" /// \addtogroup blockmodule /// @{ @@ -118,31 +118,29 @@ enum class block_load_method /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - block_load_method Method = block_load_method::block_load_direct -> +template class block_load { private: using storage_type_ = typename ::rocprim::detail::empty_storage_type; public: - /// \brief Struct used to allocate a temporary memory that is required for thread - /// communication during operations provided by related parallel primitive. - /// - /// Depending on the implemention the operations exposed by parallel primitive may - /// require a temporary storage for thread communication. The storage should be allocated - /// using keywords \p __shared__. It can be aliased to - /// an externally allocated memory, or be a part of a union with other storage types - /// to increase shared memory reusability. - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen +/// \brief Struct used to allocate a temporary memory that is required for thread +/// communication during operations provided by related parallel primitive. +/// +/// Depending on the implemention the operations exposed by parallel primitive may +/// require a temporary storage for thread communication. The storage should be allocated +/// using keywords \p __shared__. It can be aliased to +/// an externally allocated memory, or be a part of a union with other storage types +/// to increase shared memory reusability. +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = typename ::rocprim::detail::empty_storage_type; - #else +#else using storage_type = storage_type_; // only for Doxygen - #endif +#endif /// \brief Loads data from continuous memory into an arrangement of items across the /// thread block. @@ -156,10 +154,8 @@ class block_load /// \par Overview /// * The type \p T must be such that an object of type \p InputIterator /// can be dereferenced and then implicitly converted to \p T. - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, T (&items)[ItemsPerThread]) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -182,11 +178,9 @@ class block_load /// \par Overview /// * The type \p T must be such that an object of type \p InputIterator /// can be dereferenced and then implicitly converted to \p T. - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -212,23 +206,18 @@ class block_load /// \par Overview /// * The type \p T must be such that an object of type \p InputIterator /// can be dereferenced and then implicitly converted to \p T. - template< - class InputIterator, - class Default - > - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - block_load_direct_blocked(flat_id, block_input, items, valid, - out_of_bounds); + block_load_direct_blocked(flat_id, block_input, items, valid, out_of_bounds); } /// \brief Loads data from continuous memory into an arrangement of items across the @@ -261,17 +250,15 @@ class block_load /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, T (&items)[ItemsPerThread], storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); - (void) storage; + (void)storage; load(block_input, items); } @@ -306,18 +293,17 @@ class block_load /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); - (void) storage; + (void)storage; load(block_input, items, valid); } @@ -355,22 +341,18 @@ class block_load /// ... /// } /// \endcode - template< - class InputIterator, - class Default - > - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); - (void) storage; + (void)storage; load(block_input, items, valid, out_of_bounds); } }; @@ -380,35 +362,27 @@ class block_load #ifndef DOXYGEN_SHOULD_SKIP_THIS -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template class block_load { private: using storage_type_ = typename ::rocprim::detail::empty_storage_type; public: - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = typename ::rocprim::detail::empty_storage_type; - #else +#else using storage_type = storage_type_; // only for Doxygen - #endif +#endif - ROCPRIM_DEVICE inline - void load(T* block_input, - T (&items)[ItemsPerThread]) + ROCPRIM_DEVICE inline void load(T* block_input, T (&items)[ItemsPerThread]) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_load_direct_blocked_vectorized(flat_id, block_input, items); } - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - U (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, U (&items)[ItemsPerThread]) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -418,11 +392,9 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -432,88 +404,70 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - block_load_direct_blocked(flat_id, block_input, items, valid, - out_of_bounds); + block_load_direct_blocked(flat_id, block_input, items, valid, out_of_bounds); } - ROCPRIM_DEVICE inline - void load(T* block_input, - T (&items)[ItemsPerThread], - storage_type& storage) + ROCPRIM_DEVICE inline void + load(T* block_input, T (&items)[ItemsPerThread], storage_type& storage) { - (void) storage; + (void)storage; load(block_input, items); } - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - U (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, U (&items)[ItemsPerThread], storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); - (void) storage; + (void)storage; load(block_input, items); } - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); - (void) storage; + (void)storage; load(block_input, items, valid); } - template< - class InputIterator, - class Default - > - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); - (void) storage; + (void)storage; load(block_input, items, valid, out_of_bounds); } }; -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template class block_load { private: @@ -522,63 +476,52 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, T (&items)[ItemsPerThread]) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_load_direct_striped(flat_id, block_input, items); block_exchange_type().striped_to_blocked(items, items, storage); } - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_load_direct_striped(flat_id, block_input, items, valid); block_exchange_type().striped_to_blocked(items, items, storage); } - template< - class InputIterator, - class Default - > - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - block_load_direct_striped(flat_id, block_input, items, valid, - out_of_bounds); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + block_load_direct_striped(flat_id, block_input, items, valid, out_of_bounds); block_exchange_type().striped_to_blocked(items, items, storage); } - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, T (&items)[ItemsPerThread], storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -589,12 +532,11 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -605,33 +547,24 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - block_load_direct_striped(flat_id, block_input, items, valid, - out_of_bounds); + block_load_direct_striped(flat_id, block_input, items, valid, out_of_bounds); block_exchange_type().striped_to_blocked(items, items, storage); } }; -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template class block_load { private: @@ -639,68 +572,56 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, T (&items)[ItemsPerThread]) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_load_direct_warp_striped(flat_id, block_input, items); block_exchange_type().warp_striped_to_blocked(items, items, storage); } - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_load_direct_warp_striped(flat_id, block_input, items, valid); block_exchange_type().warp_striped_to_blocked(items, items, storage); - } - template< - class InputIterator, - class Default - > - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - block_load_direct_warp_striped(flat_id, block_input, items, valid, - out_of_bounds); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + block_load_direct_warp_striped(flat_id, block_input, items, valid, out_of_bounds); block_exchange_type().warp_striped_to_blocked(items, items, storage); } - template - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + load(InputIterator block_input, T (&items)[ItemsPerThread], storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -711,12 +632,11 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, @@ -727,24 +647,19 @@ class block_load - ROCPRIM_DEVICE inline - void load(InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds, - storage_type& storage) + template + ROCPRIM_DEVICE inline void load(InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds, + storage_type& storage) { using value_type = typename std::iterator_traits::value_type; static_assert(std::is_convertible::value, "The type T must be such that an object of type InputIterator " "can be dereferenced and then implicitly converted to T."); const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - block_load_direct_warp_striped(flat_id, block_input, items, valid, - out_of_bounds); + block_load_direct_warp_striped(flat_id, block_input, items, valid, out_of_bounds); block_exchange_type().warp_striped_to_blocked(items, items, storage); } }; diff --git a/rocprim/include/rocprim/block/block_load_func.hpp b/rocprim/include/rocprim/block/block_load_func.hpp index 99fcf4264..469c19c24 100644 --- a/rocprim/include/rocprim/block/block_load_func.hpp +++ b/rocprim/include/rocprim/block/block_load_func.hpp @@ -24,8 +24,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -49,20 +49,15 @@ BEGIN_ROCPRIM_NAMESPACE /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_input - the input iterator from the thread block to load from /// \param items - array that data is loaded to -template< - class InputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_load_direct_blocked(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline void block_load_direct_blocked(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread]) { - unsigned int offset = flat_id * ItemsPerThread; + unsigned int offset = flat_id * ItemsPerThread; InputIterator thread_iter = block_input + offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item]; } @@ -85,23 +80,18 @@ void block_load_direct_blocked(unsigned int flat_id, /// \param block_input - the input iterator from the thread block to load from /// \param items - array that data is loaded to /// \param valid - maximum range of valid numbers to load -template< - class InputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_load_direct_blocked(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid) +template +ROCPRIM_DEVICE inline void block_load_direct_blocked(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid) { - unsigned int offset = flat_id * ItemsPerThread; + unsigned int offset = flat_id * ItemsPerThread; InputIterator thread_iter = block_input + offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { - if (item + offset < valid) + if(item + offset < valid) { items[item] = thread_iter[item]; } @@ -128,21 +118,15 @@ void block_load_direct_blocked(unsigned int flat_id, /// \param items - array that data is loaded to /// \param valid - maximum range of valid numbers to load /// \param out_of_bounds - default value assigned to out-of-bound items -template< - class InputIterator, - class T, - unsigned int ItemsPerThread, - class Default -> -ROCPRIM_DEVICE inline -void block_load_direct_blocked(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds) +template +ROCPRIM_DEVICE inline void block_load_direct_blocked(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) { - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = out_of_bounds; } @@ -175,47 +159,37 @@ void block_load_direct_blocked(unsigned int flat_id, /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_input - the input iterator from the thread block to load from /// \param items - array that data is loaded to -template< - class T, - class U, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if()>::type -block_load_direct_blocked_vectorized(unsigned int flat_id, - T* block_input, - U (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline typename std::enable_if()>::type + block_load_direct_blocked_vectorized(unsigned int flat_id, + T* block_input, + U (&items)[ItemsPerThread]) { typedef typename detail::match_vector_type::type vector_type; constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type); - vector_type vector_items[vectors_per_thread]; + vector_type vector_items[vectors_per_thread]; - const vector_type* vector_ptr = reinterpret_cast(block_input) + - (flat_id * vectors_per_thread); + const vector_type* vector_ptr + = reinterpret_cast(block_input) + (flat_id * vectors_per_thread); - #pragma unroll - for (unsigned int item = 0; item < vectors_per_thread; item++) +#pragma unroll + for(unsigned int item = 0; item < vectors_per_thread; item++) { vector_items[item] = *(vector_ptr + item); } - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = *(reinterpret_cast(vector_items) + item); } } -template< - class T, - class U, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if()>::type -block_load_direct_blocked_vectorized(unsigned int flat_id, - T* block_input, - U (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline typename std::enable_if()>::type + block_load_direct_blocked_vectorized(unsigned int flat_id, + T* block_input, + U (&items)[ItemsPerThread]) { block_load_direct_blocked(flat_id, block_input, items); } @@ -237,20 +211,14 @@ block_load_direct_blocked_vectorized(unsigned int flat_id, /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_input - the input iterator from the thread block to load from /// \param items - array that data is loaded to -template< - unsigned int BlockSize, - class InputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_load_direct_striped(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline void block_load_direct_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread]) { InputIterator thread_iter = block_input + flat_id; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item * BlockSize]; } @@ -274,24 +242,18 @@ void block_load_direct_striped(unsigned int flat_id, /// \param block_input - the input iterator from the thread block to load from /// \param items - array that data is loaded to /// \param valid - maximum range of valid numbers to load -template< - unsigned int BlockSize, - class InputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_load_direct_striped(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid) +template +ROCPRIM_DEVICE inline void block_load_direct_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid) { InputIterator thread_iter = block_input + flat_id; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * BlockSize; - if (flat_id + offset < valid) + if(flat_id + offset < valid) { items[item] = thread_iter[offset]; } @@ -319,22 +281,19 @@ void block_load_direct_striped(unsigned int flat_id, /// \param items - array that data is loaded to /// \param valid - maximum range of valid numbers to load /// \param out_of_bounds - default value assigned to out-of-bound items -template< - unsigned int BlockSize, - class InputIterator, - class T, - unsigned int ItemsPerThread, - class Default -> -ROCPRIM_DEVICE inline -void block_load_direct_striped(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds) +template +ROCPRIM_DEVICE inline void block_load_direct_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) { - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = out_of_bounds; } @@ -366,27 +325,24 @@ void block_load_direct_striped(unsigned int flat_id, /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_input - the input iterator from the thread block to load from /// \param items - array that data is loaded to -template< - unsigned int WarpSize = warp_size(), - class InputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_load_direct_warp_striped(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline void block_load_direct_warp_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread]) { static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= warp_size(), - "WarpSize must be a power of two and equal or less" - "than the size of hardware warp."); - unsigned int thread_id = detail::logical_lane_id(); - unsigned int warp_id = flat_id / WarpSize; + "WarpSize must be a power of two and equal or less" + "than the size of hardware warp."); + unsigned int thread_id = detail::logical_lane_id(); + unsigned int warp_id = flat_id / WarpSize; unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; InputIterator thread_iter = block_input + thread_id + warp_offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item * WarpSize]; } @@ -417,31 +373,28 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// \param block_input - the input iterator from the thread block to load from /// \param items - array that data is loaded to /// \param valid - maximum range of valid numbers to load -template< - unsigned int WarpSize = warp_size(), - class InputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_load_direct_warp_striped(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid) +template +ROCPRIM_DEVICE inline void block_load_direct_warp_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid) { static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= warp_size(), - "WarpSize must be a power of two and equal or less" - "than the size of hardware warp."); - unsigned int thread_id = detail::logical_lane_id(); - unsigned int warp_id = flat_id / WarpSize; + "WarpSize must be a power of two and equal or less" + "than the size of hardware warp."); + unsigned int thread_id = detail::logical_lane_id(); + unsigned int warp_id = flat_id / WarpSize; unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; InputIterator thread_iter = block_input + thread_id + warp_offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * WarpSize; - if (warp_offset + thread_id + offset < valid) + if(warp_offset + thread_id + offset < valid) { items[item] = thread_iter[offset]; } @@ -476,25 +429,22 @@ void block_load_direct_warp_striped(unsigned int flat_id, /// \param items - array that data is loaded to /// \param valid - maximum range of valid numbers to load /// \param out_of_bounds - default value assigned to out-of-bound items -template< - unsigned int WarpSize = warp_size(), - class InputIterator, - class T, - unsigned int ItemsPerThread, - class Default -> -ROCPRIM_DEVICE inline -void block_load_direct_warp_striped(unsigned int flat_id, - InputIterator block_input, - T (&items)[ItemsPerThread], - unsigned int valid, - Default out_of_bounds) +template +ROCPRIM_DEVICE inline void block_load_direct_warp_striped(unsigned int flat_id, + InputIterator block_input, + T (&items)[ItemsPerThread], + unsigned int valid, + Default out_of_bounds) { static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= warp_size(), - "WarpSize must be a power of two and equal or less" - "than the size of hardware warp."); - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) + "WarpSize must be a power of two and equal or less" + "than the size of hardware warp."); +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = out_of_bounds; } diff --git a/rocprim/include/rocprim/block/block_radix_sort.hpp b/rocprim/include/rocprim/block/block_radix_sort.hpp index 5bfdbf98f..a7e208354 100644 --- a/rocprim/include/rocprim/block/block_radix_sort.hpp +++ b/rocprim/include/rocprim/block/block_radix_sort.hpp @@ -24,12 +24,12 @@ #include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/radix_sort.hpp" +#include "../detail/various.hpp" #include "../warp/detail/warp_scan_crosslane.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" #include "block_exchange.hpp" @@ -42,90 +42,89 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -/// Specialized block scan of bool (1 bit values) -/// It uses warp scan and reduce functions of bool (1 bit values) based on ballot and bit count. -/// They have much better performance (several times faster) than generic scan and reduce classes -/// because of using hardware ability to calculate which lanes have true predicate values. -template -class block_bit_plus_scan -{ - // Select warp size - static constexpr unsigned int warp_size = - detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); - // Number of warps in block - static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size; - - // typedef of warp_scan primitive that will be used to get prefix values for - // each warp (scanned carry-outs from warps before it) - // warp_scan_crosslane is an implementation of warp_scan that does not need storage, - // but requires logical warp size to be a power of two. - using warp_scan_prefix_type = - ::rocprim::detail::warp_scan_crosslane; - -public: - - struct storage_type_ + /// Specialized block scan of bool (1 bit values) + /// It uses warp scan and reduce functions of bool (1 bit values) based on ballot and bit count. + /// They have much better performance (several times faster) than generic scan and reduce classes + /// because of using hardware ability to calculate which lanes have true predicate values. + template + class block_bit_plus_scan { - unsigned int warp_prefixes[warps_no]; - // ---------- Shared memory optimisation ---------- - // Since we use warp_scan_crosslane for warp scan, we don't need to allocate - // any temporary memory for it. - }; - - using storage_type = detail::raw_storage; + // Select warp size + static constexpr unsigned int warp_size + = detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); + // Number of warps in block + static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size; + + // typedef of warp_scan primitive that will be used to get prefix values for + // each warp (scanned carry-outs from warps before it) + // warp_scan_crosslane is an implementation of warp_scan that does not need storage, + // but requires logical warp size to be a power of two. + using warp_scan_prefix_type + = ::rocprim::detail::warp_scan_crosslane; + + public: + struct storage_type_ + { + unsigned int warp_prefixes[warps_no]; + // ---------- Shared memory optimisation ---------- + // Since we use warp_scan_crosslane for warp scan, we don't need to allocate + // any temporary memory for it. + }; - template - ROCPRIM_DEVICE inline - void exclusive_scan(const unsigned int (&input)[ItemsPerThread], - unsigned int (&output)[ItemsPerThread], - unsigned int& reduction, - storage_type& storage) - { - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int lane_id = ::rocprim::lane_id(); - const unsigned int warp_id = ::rocprim::warp_id(); - storage_type_& storage_ = storage.get(); + using storage_type = detail::raw_storage; - unsigned int warp_reduction = ::rocprim::bit_count(::rocprim::ballot(input[0])); - for(unsigned int i = 1; i < ItemsPerThread; i++) - { - warp_reduction += ::rocprim::bit_count(::rocprim::ballot(input[i])); - } - if(lane_id == 0) + template + ROCPRIM_DEVICE inline void exclusive_scan(const unsigned int (&input)[ItemsPerThread], + unsigned int (&output)[ItemsPerThread], + unsigned int& reduction, + storage_type& storage) { - storage_.warp_prefixes[warp_id] = warp_reduction; - } - ::rocprim::syncthreads(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int lane_id = ::rocprim::lane_id(); + const unsigned int warp_id = ::rocprim::warp_id(); + storage_type_& storage_ = storage.get(); - // Scan the warp reduction results to calculate warp prefixes - if(flat_id < warps_no) - { - unsigned int prefix = storage_.warp_prefixes[flat_id]; - warp_scan_prefix_type().inclusive_scan(prefix, prefix, ::rocprim::plus()); - storage_.warp_prefixes[flat_id] = prefix; - } - ::rocprim::syncthreads(); + unsigned int warp_reduction = ::rocprim::bit_count(::rocprim::ballot(input[0])); + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + warp_reduction += ::rocprim::bit_count(::rocprim::ballot(input[i])); + } + if(lane_id == 0) + { + storage_.warp_prefixes[warp_id] = warp_reduction; + } + ::rocprim::syncthreads(); - // Perform exclusive warp scan of bit values - unsigned int lane_prefix = 0; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - lane_prefix = ::rocprim::masked_bit_count(::rocprim::ballot(input[i]), lane_prefix); - } + // Scan the warp reduction results to calculate warp prefixes + if(flat_id < warps_no) + { + unsigned int prefix = storage_.warp_prefixes[flat_id]; + warp_scan_prefix_type().inclusive_scan( + prefix, prefix, ::rocprim::plus()); + storage_.warp_prefixes[flat_id] = prefix; + } + ::rocprim::syncthreads(); - // Scan the lane's items and calculate final scan results - output[0] = warp_id == 0 - ? lane_prefix - : lane_prefix + storage_.warp_prefixes[warp_id - 1]; - for(unsigned int i = 1; i < ItemsPerThread; i++) - { - output[i] = output[i - 1] + input[i - 1]; - } + // Perform exclusive warp scan of bit values + unsigned int lane_prefix = 0; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + lane_prefix = ::rocprim::masked_bit_count(::rocprim::ballot(input[i]), lane_prefix); + } - // Get the final inclusive reduction result - reduction = storage_.warp_prefixes[warps_no - 1]; - } -}; + // Scan the lane's items and calculate final scan results + output[0] + = warp_id == 0 ? lane_prefix : lane_prefix + storage_.warp_prefixes[warp_id - 1]; + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + output[i] = output[i - 1] + input[i - 1]; + } + + // Get the final inclusive reduction result + reduction = storage_.warp_prefixes[warps_no - 1]; + } + }; } // end namespace detail @@ -175,20 +174,16 @@ class block_bit_plus_scan /// } /// \endcode /// \endparblock -template< - class Key, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class Value = empty_type -> +template class block_radix_sort { static constexpr bool with_values = !std::is_same::value; - using bit_key_type = typename ::rocprim::detail::radix_key_codec::bit_key_type; + using bit_key_type = typename ::rocprim::detail::radix_key_codec::bit_key_type; using bit_block_scan = detail::block_bit_plus_scan; - using bit_keys_exchange_type = ::rocprim::block_exchange; + using bit_keys_exchange_type + = ::rocprim::block_exchange; using values_exchange_type = ::rocprim::block_exchange; // Struct used for creating a raw_storage object for this primitive's temporary storage. @@ -197,26 +192,25 @@ class block_radix_sort union { typename bit_keys_exchange_type::storage_type bit_keys_exchange; - typename values_exchange_type::storage_type values_exchange; + typename values_exchange_type::storage_type values_exchange; }; typename bit_block_scan::storage_type bit_block_scan; }; public: - - /// \brief Struct used to allocate a temporary memory that is required for thread - /// communication during operations provided by related parallel primitive. - /// - /// Depending on the implemention the operations exposed by parallel primitive may - /// require a temporary storage for thread communication. The storage should be allocated - /// using keywords __shared__. It can be aliased to - /// an externally allocated memory, or be a part of a union type with other storage types - /// to increase shared memory reusability. - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen +/// \brief Struct used to allocate a temporary memory that is required for thread +/// communication during operations provided by related parallel primitive. +/// +/// Depending on the implemention the operations exposed by parallel primitive may +/// require a temporary storage for thread communication. The storage should be allocated +/// using keywords __shared__. It can be aliased to +/// an externally allocated memory, or be a part of a union type with other storage types +/// to increase shared memory reusability. +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = detail::raw_storage; - #else +#else using storage_type = storage_type_; // only for Doxygen - #endif +#endif /// \brief Performs ascending radix sort over keys partitioned across threads in a block. /// @@ -259,11 +253,10 @@ class block_radix_sort /// If the \p input values across threads in a block are {[256, 255], ..., [4, 3], [2, 1]}}, then /// then after sort they will be equal {[1, 2], [3, 4] ..., [255, 256]}. /// \endparblock - ROCPRIM_DEVICE inline - void sort(Key (&keys)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort(Key (&keys)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { empty_type values[ItemsPerThread]; sort_impl(keys, values, storage, begin_bit, end_bit); @@ -281,10 +274,9 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - ROCPRIM_DEVICE inline - void sort(Key (&keys)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort(Key (&keys)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort(keys, storage, begin_bit, end_bit); @@ -331,11 +323,10 @@ class block_radix_sort /// If the \p input values across threads in a block are {[1, 2], [3, 4] ..., [255, 256]}, /// then after sort they will be equal {[256, 255], ..., [4, 3], [2, 1]}. /// \endparblock - ROCPRIM_DEVICE inline - void sort_desc(Key (&keys)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort_desc(Key (&keys)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { empty_type values[ItemsPerThread]; sort_impl(keys, values, storage, begin_bit, end_bit); @@ -353,10 +344,9 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - ROCPRIM_DEVICE inline - void sort_desc(Key (&keys)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort_desc(Key (&keys)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort_desc(keys, storage, begin_bit, end_bit); @@ -411,13 +401,13 @@ class block_radix_sort /// will be equal {[1, 2], [3, 4] ..., [255, 256]} and the \p values will be /// equal {[128, 128], [127, 127] ..., [2, 2], [1, 1]}. /// \endparblock - template - ROCPRIM_DEVICE inline - void sort(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void + sort(Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { sort_impl(keys, values, storage, begin_bit, end_bit); } @@ -438,12 +428,12 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - template - ROCPRIM_DEVICE inline - void sort(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void + sort(Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort(keys, values, storage, begin_bit, end_bit); @@ -498,13 +488,13 @@ class block_radix_sort /// the \p keys will be equal {[256, 255], ..., [4, 3], [2, 1]} and the \p values /// will be equal {[1, 1], [2, 2] ..., [128, 128]}. /// \endparblock - template - ROCPRIM_DEVICE inline - void sort_desc(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void + sort_desc(Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { sort_impl(keys, values, storage, begin_bit, end_bit); } @@ -525,12 +515,12 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - template - ROCPRIM_DEVICE inline - void sort_desc(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void + sort_desc(Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort_desc(keys, values, storage, begin_bit, end_bit); @@ -578,11 +568,10 @@ class block_radix_sort /// If the \p input values across threads in a block are {[256, 255], ..., [4, 3], [2, 1]}}, then /// then after sort they will be equal {[1, 129], [2, 130] ..., [128, 256]}. /// \endparblock - ROCPRIM_DEVICE inline - void sort_to_striped(Key (&keys)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort_to_striped(Key (&keys)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { empty_type values[ItemsPerThread]; sort_impl(keys, values, storage, begin_bit, end_bit); @@ -601,10 +590,9 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - ROCPRIM_DEVICE inline - void sort_to_striped(Key (&keys)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort_to_striped(Key (&keys)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort_to_striped(keys, storage, begin_bit, end_bit); @@ -652,11 +640,10 @@ class block_radix_sort /// If the \p input values across threads in a block are {[1, 2], [3, 4] ..., [255, 256]}, /// then after sort they will be equal {[256, 128], ..., [130, 2], [129, 1]}. /// \endparblock - ROCPRIM_DEVICE inline - void sort_desc_to_striped(Key (&keys)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort_desc_to_striped(Key (&keys)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { empty_type values[ItemsPerThread]; sort_impl(keys, values, storage, begin_bit, end_bit); @@ -675,10 +662,9 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - ROCPRIM_DEVICE inline - void sort_desc_to_striped(Key (&keys)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + ROCPRIM_DEVICE inline void sort_desc_to_striped(Key (&keys)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort_desc_to_striped(keys, storage, begin_bit, end_bit); @@ -733,13 +719,13 @@ class block_radix_sort /// \p keys will be equal {[1, 5], [2, 6], [3, 7], [4, 8]} and the \p values will be /// equal {[-8, -4], [-7, -3], [-6, -2], [-5, -1]}. /// \endparblock - template - ROCPRIM_DEVICE inline - void sort_to_striped(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void + sort_to_striped(Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { sort_impl(keys, values, storage, begin_bit, end_bit); } @@ -758,12 +744,12 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - template - ROCPRIM_DEVICE inline - void sort_to_striped(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void + sort_to_striped(Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort_to_striped(keys, values, storage, begin_bit, end_bit); @@ -818,13 +804,13 @@ class block_radix_sort /// \p keys will be equal {[8, 4], [7, 3], [6, 2], [5, 1]} and the \p values will be /// equal {[10, 50], [20, 60], [30, 70], [40, 80]}. /// \endparblock - template - ROCPRIM_DEVICE inline - void sort_desc_to_striped(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void sort_desc_to_striped( + Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { sort_impl(keys, values, storage, begin_bit, end_bit); } @@ -843,28 +829,26 @@ class block_radix_sort /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in /// key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default /// value: \p 8 * sizeof(Key). - template - ROCPRIM_DEVICE inline - void sort_desc_to_striped(Key (&keys)[ItemsPerThread], - typename std::enable_if::type (&values)[ItemsPerThread], - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key)) + template + ROCPRIM_DEVICE inline void sort_desc_to_striped( + Key (&keys)[ItemsPerThread], + typename std::enable_if::type (&values)[ItemsPerThread], + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key)) { ROCPRIM_SHARED_MEMORY storage_type storage; sort_desc_to_striped(keys, values, storage, begin_bit, end_bit); } private: - - template - ROCPRIM_DEVICE inline - void sort_impl(Key (&keys)[ItemsPerThread], - SortedValue (&values)[ItemsPerThread], - storage_type& storage, - unsigned int begin_bit, - unsigned int end_bit) + template + ROCPRIM_DEVICE inline void sort_impl(Key (&keys)[ItemsPerThread], + SortedValue (&values)[ItemsPerThread], + storage_type& storage, + unsigned int begin_bit, + unsigned int end_bit) { - using key_codec = ::rocprim::detail::radix_key_codec; + using key_codec = ::rocprim::detail::radix_key_codec; storage_type_& storage_ = storage.get(); const unsigned int flat_id = ::rocprim::flat_block_thread_id(); @@ -893,9 +877,8 @@ class block_radix_sort for(unsigned int i = 0; i < ItemsPerThread; i++) { // Calculate position for the first digit (0) value based on positions of the second (1) - ranks[i] = bits[i] != 0 - ? (start + ranks[i]) - : (flat_id * ItemsPerThread + i - ranks[i]); + ranks[i] + = bits[i] != 0 ? (start + ranks[i]) : (flat_id * ItemsPerThread + i - ranks[i]); } exchange_keys(storage, bit_keys, ranks); exchange_values(storage, values, ranks); @@ -913,62 +896,56 @@ class block_radix_sort } } - ROCPRIM_DEVICE inline - void exchange_keys(storage_type& storage, - bit_key_type (&bit_keys)[ItemsPerThread], - const unsigned int (&ranks)[ItemsPerThread]) + ROCPRIM_DEVICE inline void exchange_keys(storage_type& storage, + bit_key_type (&bit_keys)[ItemsPerThread], + const unsigned int (&ranks)[ItemsPerThread]) { storage_type_& storage_ = storage.get(); // Synchronization is omitted here because bit_block_scan already calls it - bit_keys_exchange_type().scatter_to_blocked(bit_keys, bit_keys, ranks, storage_.bit_keys_exchange); + bit_keys_exchange_type().scatter_to_blocked( + bit_keys, bit_keys, ranks, storage_.bit_keys_exchange); } - template - ROCPRIM_DEVICE inline - void exchange_values(storage_type& storage, - SortedValue (&values)[ItemsPerThread], - const unsigned int (&ranks)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void exchange_values(storage_type& storage, + SortedValue (&values)[ItemsPerThread], + const unsigned int (&ranks)[ItemsPerThread]) { storage_type_& storage_ = storage.get(); ::rocprim::syncthreads(); // Storage will be reused (union), synchronization is needed values_exchange_type().scatter_to_blocked(values, values, ranks, storage_.values_exchange); } - ROCPRIM_DEVICE inline - void exchange_values(storage_type& storage, - empty_type (&values)[ItemsPerThread], - const unsigned int (&ranks)[ItemsPerThread]) + ROCPRIM_DEVICE inline void exchange_values(storage_type& storage, + empty_type (&values)[ItemsPerThread], + const unsigned int (&ranks)[ItemsPerThread]) { - (void) storage; - (void) values; - (void) ranks; + (void)storage; + (void)values; + (void)ranks; } - ROCPRIM_DEVICE inline - void to_striped_keys(storage_type& storage, - bit_key_type (&bit_keys)[ItemsPerThread]) + ROCPRIM_DEVICE inline void to_striped_keys(storage_type& storage, + bit_key_type (&bit_keys)[ItemsPerThread]) { storage_type_& storage_ = storage.get(); ::rocprim::syncthreads(); bit_keys_exchange_type().blocked_to_striped(bit_keys, bit_keys, storage_.bit_keys_exchange); } - template - ROCPRIM_DEVICE inline - void to_striped_values(storage_type& storage, - SortedValue (&values)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void to_striped_values(storage_type& storage, + SortedValue (&values)[ItemsPerThread]) { storage_type_& storage_ = storage.get(); ::rocprim::syncthreads(); // Storage will be reused (union), synchronization is needed values_exchange_type().blocked_to_striped(values, values, storage_.values_exchange); } - ROCPRIM_DEVICE inline - void to_striped_values(storage_type& storage, - empty_type * values) + ROCPRIM_DEVICE inline void to_striped_values(storage_type& storage, empty_type* values) { - (void) storage; - (void) values; + (void)storage; + (void)values; } }; diff --git a/rocprim/include/rocprim/block/block_reduce.hpp b/rocprim/include/rocprim/block/block_reduce.hpp index c79156fbf..89b7daafd 100644 --- a/rocprim/include/rocprim/block/block_reduce.hpp +++ b/rocprim/include/rocprim/block/block_reduce.hpp @@ -26,11 +26,11 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" -#include "detail/block_reduce_warp_reduce.hpp" #include "detail/block_reduce_raking_reduce.hpp" +#include "detail/block_reduce_warp_reduce.hpp" /// \addtogroup blockmodule /// @{ @@ -51,24 +51,24 @@ enum class block_reduce_algorithm namespace detail { -// Selector for block_reduce algorithm which gives block reduce implementation -// type based on passed block_reduce_algorithm enum -template -struct select_block_reduce_impl; + // Selector for block_reduce algorithm which gives block reduce implementation + // type based on passed block_reduce_algorithm enum + template + struct select_block_reduce_impl; -template<> -struct select_block_reduce_impl -{ - template - using type = block_reduce_warp_reduce; -}; + template <> + struct select_block_reduce_impl + { + template + using type = block_reduce_warp_reduce; + }; -template<> -struct select_block_reduce_impl -{ - template - using type = block_reduce_raking_reduce; -}; + template <> + struct select_block_reduce_impl + { + template + using type = block_reduce_raking_reduce; + }; } // end namespace detail @@ -115,17 +115,17 @@ struct select_block_reduce_impl /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSize, - block_reduce_algorithm Algorithm = block_reduce_algorithm::default_algorithm -> +template class block_reduce #ifndef DOXYGEN_SHOULD_SKIP_THIS : private detail::select_block_reduce_impl::template type #endif { - using base_type = typename detail::select_block_reduce_impl::template type; + using base_type = + typename detail::select_block_reduce_impl::template type; + public: /// \brief Struct used to allocate a temporary memory that is required for thread /// communication during operations provided by related parallel primitive. @@ -183,12 +183,11 @@ class block_reduce /// If the \p input values across threads in a block are {1, -2, 3, -4, ..., 255, -256}, then /// \p output value will be {-256}. /// \endparblock - template> - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - storage_type& storage, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + storage_type& storage, + BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, storage, reduce_op); } @@ -208,11 +207,9 @@ class block_reduce /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void + reduce(T input, T& output, BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, reduce_op); } @@ -264,15 +261,11 @@ class block_reduce /// If the \p input values across threads in a block are {-1, 2, -3, 4, ..., -255, 256}, then /// \p output value will be {256}. /// \endparblock - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void reduce(T (&input)[ItemsPerThread], - T& output, - storage_type& storage, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void reduce(T (&input)[ItemsPerThread], + T& output, + storage_type& storage, + BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, storage, reduce_op); } @@ -293,14 +286,9 @@ class block_reduce /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void reduce(T (&input)[ItemsPerThread], - T& output, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void + reduce(T (&input)[ItemsPerThread], T& output, BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, reduce_op); } @@ -351,13 +339,12 @@ class block_reduce /// } /// \endcode /// \endparblock - template> - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - unsigned int valid_items, - storage_type& storage, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, valid_items, storage, reduce_op); } @@ -379,12 +366,11 @@ class block_reduce /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - unsigned int valid_items, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + unsigned int valid_items, + BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, valid_items, reduce_op); } diff --git a/rocprim/include/rocprim/block/block_scan.hpp b/rocprim/include/rocprim/block/block_scan.hpp index 6f3102a82..ebe38c2d6 100644 --- a/rocprim/include/rocprim/block/block_scan.hpp +++ b/rocprim/include/rocprim/block/block_scan.hpp @@ -26,11 +26,11 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" -#include "detail/block_scan_warp_scan.hpp" #include "detail/block_scan_reduce_then_scan.hpp" +#include "detail/block_scan_warp_scan.hpp" /// \addtogroup blockmodule /// @{ @@ -51,30 +51,28 @@ enum class block_scan_algorithm namespace detail { -// Selector for block_scan algorithm which gives block scan implementation -// type based on passed block_scan_algorithm enum -template -struct select_block_scan_impl; + // Selector for block_scan algorithm which gives block scan implementation + // type based on passed block_scan_algorithm enum + template + struct select_block_scan_impl; -template<> -struct select_block_scan_impl -{ - template - using type = block_scan_warp_scan; -}; + template <> + struct select_block_scan_impl + { + template + using type = block_scan_warp_scan; + }; -template<> -struct select_block_scan_impl -{ - template - // When BlockSize is less than hardware warp size block_scan_warp_scan performs better than - // block_scan_reduce_then_scan by specializing for warps - using type = typename std::conditional< - (BlockSize <= ::rocprim::warp_size()), - block_scan_warp_scan, - block_scan_reduce_then_scan - >::type; -}; + template <> + struct select_block_scan_impl + { + template + // When BlockSize is less than hardware warp size block_scan_warp_scan performs better than + // block_scan_reduce_then_scan by specializing for warps + using type = typename std::conditional<(BlockSize <= ::rocprim::warp_size()), + block_scan_warp_scan, + block_scan_reduce_then_scan>::type; + }; } // end namespace detail @@ -122,17 +120,17 @@ struct select_block_scan_impl /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSize, - block_scan_algorithm Algorithm = block_scan_algorithm::default_algorithm -> +template class block_scan #ifndef DOXYGEN_SHOULD_SKIP_THIS : private detail::select_block_scan_impl::template type #endif { - using base_type = typename detail::select_block_scan_impl::template type; + using base_type = + typename detail::select_block_scan_impl::template type; + public: /// \brief Struct used to allocate a temporary memory that is required for thread /// communication during operations provided by related parallel primitive. @@ -190,12 +188,11 @@ class block_scan /// If the \p input values across threads in a block are {1, -2, 3, -4, ..., 255, -256}, then /// \p output values in will be {1, -2, -2, -4, ..., -254, -256}. /// \endparblock - template> - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T input, + T& output, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::inclusive_scan(input, output, storage, scan_op); } @@ -215,11 +212,9 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, BinaryFunction scan_op = BinaryFunction()) { base_type::inclusive_scan(input, output, scan_op); } @@ -274,13 +269,12 @@ class block_scan /// \p output values in will be {1, -2, -2, -4, ..., -254, -256}, and the \p reduction will /// be -256. /// \endparblock - template> - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - T& reduction, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T input, + T& output, + T& reduction, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::inclusive_scan(input, output, reduction, storage, scan_op); } @@ -301,12 +295,9 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - T& reduction, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, T& reduction, BinaryFunction scan_op = BinaryFunction()) { base_type::inclusive_scan(input, output, reduction, scan_op); } @@ -387,16 +378,12 @@ class block_scan /// \p output values in will be {11, 12, 13, ..., 266}, and the \p prefix will /// be 266. /// \endparblock - template< - class PrefixCallback, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T input, + T& output, + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op); } @@ -448,15 +435,11 @@ class block_scan /// If the \p input values across threads in a block are {-1, 2, -3, 4, ..., -255, 256}, then /// \p output values in will be {-1, 2, 2, 4, ..., 254, 256}. /// \endparblock - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -484,14 +467,10 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -553,16 +532,12 @@ class block_scan /// If the \p input values across threads in a block are {-1, 2, -3, 4, ..., -255, 256}, then /// \p output values in will be {-1, 2, 2, 4, ..., 254, 256} and the \p reduction will be \p 256. /// \endparblock - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& reduction, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& reduction, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -591,15 +566,11 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& reduction, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& reduction, + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -688,17 +659,12 @@ class block_scan /// \p output values in will be {11, 12, 13, ..., 266}, and the \p prefix will /// be 266. /// \endparblock - template< - unsigned int ItemsPerThread, - class PrefixCallback, - class BinaryFunction - > - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { if(ItemsPerThread == 1) { @@ -760,13 +726,12 @@ class block_scan /// If the \p input values across threads in a block are {1, -2, 3, -4, ..., 255, -256} /// and \p init is \p 0, then \p output values in will be {0, 0, -2, -2, -4, ..., -254, -254}. /// \endparblock - template> - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T input, + T& output, + T init, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::exclusive_scan(input, output, init, storage, scan_op); } @@ -788,12 +753,9 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, BinaryFunction scan_op = BinaryFunction()) { base_type::exclusive_scan(input, output, init, scan_op); } @@ -852,14 +814,13 @@ class block_scan /// and \p init is \p 0, then \p output values in will be {0, 0, -2, -2, -4, ..., -254, -254} /// and the \p reduction will be \p -256. /// \endparblock - template> - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T input, + T& output, + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::exclusive_scan(input, output, init, reduction, storage, scan_op); } @@ -882,13 +843,9 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - T& reduction, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, T& reduction, BinaryFunction scan_op = BinaryFunction()) { base_type::exclusive_scan(input, output, init, reduction, scan_op); } @@ -969,16 +926,12 @@ class block_scan /// \p output values in will be {10, 11, 12, 13, ..., 265}, and the \p prefix will /// be 266. /// \endparblock - template< - class PrefixCallback, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T input, + T& output, + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op); } @@ -1034,16 +987,12 @@ class block_scan /// If the \p input values across threads in a block are {-1, 2, -3, 4, ..., -255, 256} /// and \p init is 0, then \p output values in will be {0, 0, 2, 2, 4, ..., 254, 254}. /// \endparblock - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -1073,15 +1022,11 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -1148,17 +1093,13 @@ class block_scan /// and \p init is 0, then \p output values in will be {0, 0, 2, 2, 4, ..., 254, 254} /// and the \p reduction will be \p 256. /// \endparblock - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -1189,16 +1130,12 @@ class block_scan /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template< - unsigned int ItemsPerThread, - class BinaryFunction = ::rocprim::plus - > - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - T& reduction, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + T& reduction, + BinaryFunction scan_op = BinaryFunction()) { if(ItemsPerThread == 1) { @@ -1287,17 +1224,12 @@ class block_scan /// \p output values in will be {10, 11, 12, 13, ..., 265}, and the \p prefix will /// be 266. /// \endparblock - template< - unsigned int ItemsPerThread, - class PrefixCallback, - class BinaryFunction - > - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { if(ItemsPerThread == 1) { diff --git a/rocprim/include/rocprim/block/block_sort.hpp b/rocprim/include/rocprim/block/block_sort.hpp index 4cb15c569..34a690d39 100644 --- a/rocprim/include/rocprim/block/block_sort.hpp +++ b/rocprim/include/rocprim/block/block_sort.hpp @@ -26,8 +26,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "detail/block_sort_bitonic.hpp" @@ -48,17 +48,17 @@ enum class block_sort_algorithm namespace detail { -// Selector for block_sort algorithm which gives block sort implementation -// type based on passed block_sort_algorithm enum -template -struct select_block_sort_impl; + // Selector for block_sort algorithm which gives block sort implementation + // type based on passed block_sort_algorithm enum + template + struct select_block_sort_impl; -template<> -struct select_block_sort_impl -{ - template - using type = block_sort_bitonic; -}; + template <> + struct select_block_sort_impl + { + template + using type = block_sort_bitonic; + }; } // end namespace detail @@ -103,18 +103,18 @@ struct select_block_sort_impl /// } /// \endcode /// \endparblock -template< - class Key, - unsigned int BlockSize, - class Value = empty_type, - block_sort_algorithm Algorithm = block_sort_algorithm::default_algorithm -> +template class block_sort #ifndef DOXYGEN_SHOULD_SKIP_THIS : private detail::select_block_sort_impl::template type #endif { - using base_type = typename detail::select_block_sort_impl::template type; + using base_type = + typename detail::select_block_sort_impl::template type; + public: /// \brief Struct used to allocate a temporary memory that is required for thread /// communication during operations provided by related parallel primitive. @@ -137,10 +137,9 @@ class block_sort /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + BinaryFunction compare_function = BinaryFunction()) { base_type::sort(thread_key, compare_function); } @@ -186,11 +185,10 @@ class block_sort /// } /// \endcode /// \endparblock - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - storage_type& storage, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + storage_type& storage, + BinaryFunction compare_function = BinaryFunction()) { base_type::sort(thread_key, storage, compare_function); } @@ -207,11 +205,10 @@ class block_sort /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + BinaryFunction compare_function = BinaryFunction()) { base_type::sort(thread_key, thread_value, compare_function); } @@ -258,12 +255,11 @@ class block_sort /// } /// \endcode /// \endparblock - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - storage_type& storage, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + storage_type& storage, + BinaryFunction compare_function = BinaryFunction()) { base_type::sort(thread_key, thread_value, storage, compare_function); } @@ -282,12 +278,11 @@ class block_sort /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - storage_type& storage, - const unsigned int size, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + storage_type& storage, + const unsigned int size, + BinaryFunction compare_function = BinaryFunction()) { base_type::sort(thread_key, storage, size, compare_function); } @@ -307,13 +302,12 @@ class block_sort /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - storage_type& storage, - const unsigned int size, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + storage_type& storage, + const unsigned int size, + BinaryFunction compare_function = BinaryFunction()) { base_type::sort(thread_key, thread_value, storage, size, compare_function); } diff --git a/rocprim/include/rocprim/block/block_store.hpp b/rocprim/include/rocprim/block/block_store.hpp index f28526b1f..804afe877 100644 --- a/rocprim/include/rocprim/block/block_store.hpp +++ b/rocprim/include/rocprim/block/block_store.hpp @@ -24,12 +24,12 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" -#include "block_store_func.hpp" #include "block_exchange.hpp" +#include "block_store_func.hpp" /// \addtogroup blockmodule /// @{ @@ -118,31 +118,29 @@ enum class block_store_method /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - block_store_method Method = block_store_method::block_store_direct -> +template class block_store { private: using storage_type_ = typename ::rocprim::detail::empty_storage_type; public: - /// \brief Struct used to allocate a temporary memory that is required for thread - /// communication during operations provided by related parallel primitive. - /// - /// Depending on the implemention the operations exposed by parallel primitive may - /// require a temporary storage for thread communication. The storage should be allocated - /// using keywords \p __shared__. It can be aliased to - /// an externally allocated memory, or be a part of a union with other storage types - /// to increase shared memory reusability. - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen +/// \brief Struct used to allocate a temporary memory that is required for thread +/// communication during operations provided by related parallel primitive. +/// +/// Depending on the implemention the operations exposed by parallel primitive may +/// require a temporary storage for thread communication. The storage should be allocated +/// using keywords \p __shared__. It can be aliased to +/// an externally allocated memory, or be a part of a union with other storage types +/// to increase shared memory reusability. +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = typename ::rocprim::detail::empty_storage_type; - #else +#else using storage_type = storage_type_; // only for Doxygen - #endif +#endif /// \brief Stores an arrangement of items from across the thread block into an /// arrangement on continuous memory. @@ -156,10 +154,8 @@ class block_store /// \par Overview /// * The type \p T must be such that an object of type \p InputIterator /// can be dereferenced and then implicitly converted to \p T. - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, T (&items)[ItemsPerThread]) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_store_direct_blocked(flat_id, block_output, items); @@ -178,11 +174,9 @@ class block_store /// \par Overview /// * The type \p T must be such that an object of type \p InputIterator /// can be dereferenced and then implicitly converted to \p T. - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_store_direct_blocked(flat_id, block_output, items, valid); @@ -218,13 +212,11 @@ class block_store /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, T (&items)[ItemsPerThread], storage_type& storage) { - (void) storage; + (void)storage; store(block_output, items); } @@ -260,14 +252,13 @@ class block_store /// ... /// } /// \endcode - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { - (void) storage; + (void)storage; store(block_output, items, valid); } }; @@ -277,86 +268,67 @@ class block_store #ifndef DOXYGEN_SHOULD_SKIP_THIS -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template class block_store { private: using storage_type_ = typename ::rocprim::detail::empty_storage_type; public: - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen using storage_type = typename ::rocprim::detail::empty_storage_type; - #else +#else using storage_type = storage_type_; // only for Doxygen - #endif +#endif - ROCPRIM_DEVICE inline - void store(T* block_output, - T (&items)[ItemsPerThread]) + ROCPRIM_DEVICE inline void store(T* block_output, T (&items)[ItemsPerThread]) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_store_direct_blocked_vectorized(flat_id, block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - U (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, U (&items)[ItemsPerThread]) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_store_direct_blocked(flat_id, block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_store_direct_blocked(flat_id, block_output, items, valid); } - ROCPRIM_DEVICE inline - void store(T* block_output, - T (&items)[ItemsPerThread], - storage_type& storage) + ROCPRIM_DEVICE inline void + store(T* block_output, T (&items)[ItemsPerThread], storage_type& storage) { - (void) storage; + (void)storage; store(block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - U (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, U (&items)[ItemsPerThread], storage_type& storage) { - (void) storage; + (void)storage; store(block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { - (void) storage; + (void)storage; store(block_output, items, valid); } }; -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template class block_store { private: @@ -365,46 +337,39 @@ class block_store - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, T (&items)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_striped(items, items, storage); block_store_direct_striped(flat_id, block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid) { ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_striped(items, items, storage); block_store_direct_striped(flat_id, block_output, items, valid); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, T (&items)[ItemsPerThread], storage_type& storage) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_striped(items, items, storage); block_store_direct_striped(flat_id, block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_striped(items, items, storage); @@ -412,11 +377,7 @@ class block_store +template class block_store { private: @@ -424,50 +385,43 @@ class block_store - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread]) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, T (&items)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_warp_striped(items, items, storage); block_store_direct_warp_striped(flat_id, block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, T (&items)[ItemsPerThread], unsigned int valid) { ROCPRIM_SHARED_MEMORY storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_warp_striped(items, items, storage); block_store_direct_warp_striped(flat_id, block_output, items, valid); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE inline void + store(OutputIterator block_output, T (&items)[ItemsPerThread], storage_type& storage) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_warp_striped(items, items, storage); block_store_direct_warp_striped(flat_id, block_output, items); } - template - ROCPRIM_DEVICE inline - void store(OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid, - storage_type& storage) + template + ROCPRIM_DEVICE inline void store(OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid, + storage_type& storage) { const unsigned int flat_id = ::rocprim::flat_block_thread_id(); block_exchange_type().blocked_to_warp_striped(items, items, storage); diff --git a/rocprim/include/rocprim/block/block_store_func.hpp b/rocprim/include/rocprim/block/block_store_func.hpp index cae30da45..d17bf1a73 100644 --- a/rocprim/include/rocprim/block/block_store_func.hpp +++ b/rocprim/include/rocprim/block/block_store_func.hpp @@ -24,8 +24,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -49,24 +49,19 @@ BEGIN_ROCPRIM_NAMESPACE /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_output - the input iterator from the thread block to store to /// \param items - array that data is stored to thread block -template< - class OutputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_store_direct_blocked(unsigned int flat_id, - OutputIterator block_output, - T (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline void block_store_direct_blocked(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread]) { static_assert(std::is_assignable::value, "The type T must be such that an object of type OutputIterator " "can be dereferenced and assigned a value of type T."); - unsigned int offset = flat_id * ItemsPerThread; + unsigned int offset = flat_id * ItemsPerThread; OutputIterator thread_iter = block_output + offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item] = items[item]; } @@ -89,27 +84,22 @@ void block_store_direct_blocked(unsigned int flat_id, /// \param block_output - the input iterator from the thread block to store to /// \param items - array that data is stored to thread block /// \param valid - maximum range of valid numbers to store -template< - class OutputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_store_direct_blocked(unsigned int flat_id, - OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid) +template +ROCPRIM_DEVICE inline void block_store_direct_blocked(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid) { static_assert(std::is_assignable::value, "The type T must be such that an object of type OutputIterator " "can be dereferenced and assigned a value of type T."); - unsigned int offset = flat_id * ItemsPerThread; + unsigned int offset = flat_id * ItemsPerThread; OutputIterator thread_iter = block_output + offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { - if (item + offset < valid) + if(item + offset < valid) { thread_iter[item] = items[item]; } @@ -141,29 +131,24 @@ void block_store_direct_blocked(unsigned int flat_id, /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_output - the input iterator from the thread block to load from /// \param items - array that data is loaded to -template< - class T, - class U, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if()>::type -block_store_direct_blocked_vectorized(unsigned int flat_id, - T* block_output, - U (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline typename std::enable_if()>::type + block_store_direct_blocked_vectorized(unsigned int flat_id, + T* block_output, + U (&items)[ItemsPerThread]) { static_assert(std::is_convertible::value, "The type U must be such that it can be implicitly converted to T."); typedef typename detail::match_vector_type::type vector_type; constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type); - vector_type *vectors_ptr = reinterpret_cast(const_cast(block_output)); + vector_type* vectors_ptr = reinterpret_cast(const_cast(block_output)); vector_type raw_vector_items[vectors_per_thread]; - T *raw_items = reinterpret_cast(raw_vector_items); + T* raw_items = reinterpret_cast(raw_vector_items); - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { raw_items[item] = items[item]; } @@ -171,16 +156,11 @@ block_store_direct_blocked_vectorized(unsigned int flat_id, block_store_direct_blocked(flat_id, vectors_ptr, raw_vector_items); } -template< - class T, - class U, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if()>::type -block_store_direct_blocked_vectorized(unsigned int flat_id, - T* block_output, - U (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline typename std::enable_if()>::type + block_store_direct_blocked_vectorized(unsigned int flat_id, + T* block_output, + U (&items)[ItemsPerThread]) { block_store_direct_blocked(flat_id, block_output, items); } @@ -202,26 +182,20 @@ block_store_direct_blocked_vectorized(unsigned int flat_id, /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_output - the input iterator from the thread block to store to /// \param items - array that data is stored to thread block -template< - unsigned int BlockSize, - class OutputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_store_direct_striped(unsigned int flat_id, - OutputIterator block_output, - T (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline void block_store_direct_striped(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread]) { static_assert(std::is_assignable::value, "The type T must be such that an object of type OutputIterator " "can be dereferenced and assigned a value of type T."); OutputIterator thread_iter = block_output + flat_id; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { - thread_iter[item * BlockSize] = items[item]; + thread_iter[item * BlockSize] = items[item]; } } @@ -243,30 +217,24 @@ void block_store_direct_striped(unsigned int flat_id, /// \param block_output - the input iterator from the thread block to store to /// \param items - array that data is stored to thread block /// \param valid - maximum range of valid numbers to store -template< - unsigned int BlockSize, - class OutputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_store_direct_striped(unsigned int flat_id, - OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid) +template +ROCPRIM_DEVICE inline void block_store_direct_striped(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid) { static_assert(std::is_assignable::value, "The type T must be such that an object of type OutputIterator " "can be dereferenced and assigned a value of type T."); OutputIterator thread_iter = block_output + flat_id; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * BlockSize; - if (flat_id + offset < valid) + if(flat_id + offset < valid) { - thread_iter[offset] = items[item]; + thread_iter[offset] = items[item]; } } } @@ -295,31 +263,28 @@ void block_store_direct_striped(unsigned int flat_id, /// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread /// \param block_output - the input iterator from the thread block to store to /// \param items - array that data is stored to thread block -template< - unsigned int WarpSize = warp_size(), - class OutputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_store_direct_warp_striped(unsigned int flat_id, - OutputIterator block_output, - T (&items)[ItemsPerThread]) +template +ROCPRIM_DEVICE inline void block_store_direct_warp_striped(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread]) { static_assert(std::is_assignable::value, "The type T must be such that an object of type OutputIterator " "can be dereferenced and assigned a value of type T."); static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= warp_size(), - "WarpSize must be a power of two and equal or less" - "than the size of hardware warp."); - unsigned int thread_id = detail::logical_lane_id(); - unsigned int warp_id = flat_id / WarpSize; + "WarpSize must be a power of two and equal or less" + "than the size of hardware warp."); + unsigned int thread_id = detail::logical_lane_id(); + unsigned int warp_id = flat_id / WarpSize; unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; OutputIterator thread_iter = block_output + thread_id + warp_offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item * WarpSize] = items[item]; } @@ -350,35 +315,32 @@ void block_store_direct_warp_striped(unsigned int flat_id, /// \param block_output - the input iterator from the thread block to store to /// \param items - array that data is stored to thread block /// \param valid - maximum range of valid numbers to store -template< - unsigned int WarpSize = warp_size(), - class OutputIterator, - class T, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -void block_store_direct_warp_striped(unsigned int flat_id, - OutputIterator block_output, - T (&items)[ItemsPerThread], - unsigned int valid) +template +ROCPRIM_DEVICE inline void block_store_direct_warp_striped(unsigned int flat_id, + OutputIterator block_output, + T (&items)[ItemsPerThread], + unsigned int valid) { static_assert(std::is_assignable::value, "The type T must be such that an object of type OutputIterator " "can be dereferenced and assigned a value of type T."); static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= warp_size(), - "WarpSize must be a power of two and equal or less" - "than the size of hardware warp."); - unsigned int thread_id = detail::logical_lane_id(); - unsigned int warp_id = flat_id / WarpSize; + "WarpSize must be a power of two and equal or less" + "than the size of hardware warp."); + unsigned int thread_id = detail::logical_lane_id(); + unsigned int warp_id = flat_id / WarpSize; unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; OutputIterator thread_iter = block_output + thread_id + warp_offset; - #pragma unroll - for (unsigned int item = 0; item < ItemsPerThread; item++) +#pragma unroll + for(unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * WarpSize; - if (warp_offset + thread_id + offset < valid) + if(warp_offset + thread_id + offset < valid) { thread_iter[offset] = items[item]; } diff --git a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp index df84c1c27..95fa6aab1 100644 --- a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp +++ b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp @@ -26,58 +26,49 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Bins -> -class block_histogram_atomic -{ - static_assert( - std::is_convertible::value, - "T must be convertible to unsigned int" - ); + template + class block_histogram_atomic + { + static_assert(std::is_convertible::value, + "T must be convertible to unsigned int"); -public: - using storage_type = typename ::rocprim::detail::empty_storage_type; + public: + using storage_type = typename ::rocprim::detail::empty_storage_type; - template - ROCPRIM_DEVICE inline - void composite(T (&input)[ItemsPerThread], - Counter hist[Bins]) - { - static_assert( - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value, - "Counter must be type that is supported by atomics (float, int, unsigned int, unsigned long long)" - ); - #pragma unroll - for (unsigned int i = 0; i < ItemsPerThread; ++i) + template + ROCPRIM_DEVICE inline void composite(T (&input)[ItemsPerThread], Counter hist[Bins]) { - ::rocprim::detail::atomic_add(&hist[static_cast(input[i])], Counter(1)); + static_assert(std::is_same::value + || std::is_same::value + || std::is_same::value + || std::is_same::value, + "Counter must be type that is supported by atomics (float, int, unsigned " + "int, unsigned long long)"); +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + ::rocprim::detail::atomic_add(&hist[static_cast(input[i])], + Counter(1)); + } + ::rocprim::syncthreads(); } - ::rocprim::syncthreads(); - } - template - ROCPRIM_DEVICE inline - void composite(T (&input)[ItemsPerThread], - Counter hist[Bins], - storage_type& storage) - { - (void) storage; - this->composite(input, hist); - } -}; + template + ROCPRIM_DEVICE inline void + composite(T (&input)[ItemsPerThread], Counter hist[Bins], storage_type& storage) + { + (void)storage; + this->composite(input, hist); + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp b/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp index 3a791b4e8..ce55faad5 100644 --- a/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp +++ b/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp @@ -26,136 +26,124 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" -#include "../block_radix_sort.hpp" #include "../block_discontinuity.hpp" +#include "../block_radix_sort.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Bins -> -class block_histogram_sort -{ - static_assert( - std::is_convertible::value, - "T must be convertible to unsigned int" - ); - -private: - using radix_sort = block_radix_sort; - using discontinuity = block_discontinuity; - -public: - union storage_type_ + template + class block_histogram_sort { - typename radix_sort::storage_type sort; - struct - { - typename discontinuity::storage_type flag; - unsigned int start[Bins]; - unsigned int end[Bins]; - }; - }; + static_assert(std::is_convertible::value, + "T must be convertible to unsigned int"); - using storage_type = detail::raw_storage; + private: + using radix_sort = block_radix_sort; + using discontinuity = block_discontinuity; - template - ROCPRIM_DEVICE inline - void composite(T (&input)[ItemsPerThread], - Counter hist[Bins]) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->composite(input, hist, storage); - } - - template - ROCPRIM_DEVICE inline - void composite(T (&input)[ItemsPerThread], - Counter hist[Bins], - storage_type& storage) - { - static_assert( - std::is_convertible::value, - "unsigned int must be convertible to Counter" - ); - constexpr auto tile_size = BlockSize * ItemsPerThread; - const auto flat_tid = ::rocprim::flat_block_thread_id(); - unsigned int head_flags[ItemsPerThread]; - discontinuity_op flags_op(storage); - storage_type_& storage_ = storage.get(); - - radix_sort().sort(input, storage_.sort); - - #pragma unroll - for(unsigned int offset = 0; offset < Bins; offset += BlockSize) + public: + union storage_type_ { - const unsigned int offset_tid = offset + flat_tid; - if(offset_tid < Bins) + typename radix_sort::storage_type sort; + struct { - storage_.start[offset_tid] = tile_size; - storage_.end[offset_tid] = tile_size; - } - } - ::rocprim::syncthreads(); + typename discontinuity::storage_type flag; + unsigned int start[Bins]; + unsigned int end[Bins]; + }; + }; + + using storage_type = detail::raw_storage; - discontinuity().flag_heads(head_flags, input, flags_op, storage_.flag); - - // ::rocprim::syncthreads() isn't required here as input is sorted by this point - // and it's impossible that flags_op will be called where b = input[0] and a != b - if(flat_tid == 0) + template + ROCPRIM_DEVICE inline void composite(T (&input)[ItemsPerThread], Counter hist[Bins]) { - storage_.start[static_cast(input[0])] = 0; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->composite(input, hist, storage); } - ::rocprim::syncthreads(); - #pragma unroll - for(unsigned int offset = 0; offset < Bins; offset += BlockSize) + template + ROCPRIM_DEVICE inline void + composite(T (&input)[ItemsPerThread], Counter hist[Bins], storage_type& storage) { - const unsigned int offset_tid = offset + flat_tid; - if(offset_tid < Bins) + static_assert(std::is_convertible::value, + "unsigned int must be convertible to Counter"); + constexpr auto tile_size = BlockSize * ItemsPerThread; + const auto flat_tid = ::rocprim::flat_block_thread_id(); + unsigned int head_flags[ItemsPerThread]; + discontinuity_op flags_op(storage); + storage_type_& storage_ = storage.get(); + + radix_sort().sort(input, storage_.sort); + +#pragma unroll + for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { - Counter count = static_cast(storage_.end[offset_tid] - storage_.start[offset_tid]); - hist[offset_tid] += count; + const unsigned int offset_tid = offset + flat_tid; + if(offset_tid < Bins) + { + storage_.start[offset_tid] = tile_size; + storage_.end[offset_tid] = tile_size; + } } - } - } + ::rocprim::syncthreads(); -private: - struct discontinuity_op - { - storage_type &storage; + discontinuity().flag_heads(head_flags, input, flags_op, storage_.flag); - ROCPRIM_DEVICE inline - discontinuity_op(storage_type &storage) : storage(storage) - { + // ::rocprim::syncthreads() isn't required here as input is sorted by this point + // and it's impossible that flags_op will be called where b = input[0] and a != b + if(flat_tid == 0) + { + storage_.start[static_cast(input[0])] = 0; + } + ::rocprim::syncthreads(); + +#pragma unroll + for(unsigned int offset = 0; offset < Bins; offset += BlockSize) + { + const unsigned int offset_tid = offset + flat_tid; + if(offset_tid < Bins) + { + Counter count = static_cast(storage_.end[offset_tid] + - storage_.start[offset_tid]); + hist[offset_tid] += count; + } + } } - ROCPRIM_DEVICE inline - bool operator()(const T& a, const T& b, unsigned int b_index) const + private: + struct discontinuity_op { - storage_type_& storage_ = storage.get(); - if(a != b) + storage_type& storage; + + ROCPRIM_DEVICE inline discontinuity_op(storage_type& storage) + : storage(storage) { - storage_.start[static_cast(b)] = b_index; - storage_.end[static_cast(a)] = b_index; - return true; } - else + + ROCPRIM_DEVICE inline bool + operator()(const T& a, const T& b, unsigned int b_index) const { - return false; + storage_type_& storage_ = storage.get(); + if(a != b) + { + storage_.start[static_cast(b)] = b_index; + storage_.end[static_cast(a)] = b_index; + return true; + } + else + { + return false; + } } - } + }; }; -}; } // end namespace detail diff --git a/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp index 24b6c6b00..606ed797f 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp @@ -26,8 +26,8 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../warp/warp_reduce.hpp" @@ -36,197 +36,165 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int BlockSize -> -class block_reduce_raking_reduce -{ - // Number of items to reduce per thread - static constexpr unsigned int thread_reduction_size_ = - (BlockSize + ::rocprim::warp_size() - 1)/ ::rocprim::warp_size(); - - // Warp reduce, warp_reduce_crosslane does not require shared memory (storage), but - // logical warp size must be a power of two. - static constexpr unsigned int warp_size_ = - detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); - // BlockSize is multiple of hardware warp - static constexpr bool block_size_smaller_than_warp_size_ = (BlockSize < warp_size_); - using warp_reduce_prefix_type = ::rocprim::detail::warp_reduce_crosslane; - - struct storage_type_ + template + class block_reduce_raking_reduce { - T threads[BlockSize]; - }; + // Number of items to reduce per thread + static constexpr unsigned int thread_reduction_size_ + = (BlockSize + ::rocprim::warp_size() - 1) / ::rocprim::warp_size(); + + // Warp reduce, warp_reduce_crosslane does not require shared memory (storage), but + // logical warp size must be a power of two. + static constexpr unsigned int warp_size_ + = detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); + // BlockSize is multiple of hardware warp + static constexpr bool block_size_smaller_than_warp_size_ = (BlockSize < warp_size_); + using warp_reduce_prefix_type + = ::rocprim::detail::warp_reduce_crosslane; + + struct storage_type_ + { + T threads[BlockSize]; + }; -public: - using storage_type = detail::raw_storage; + public: + using storage_type = detail::raw_storage; - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - storage_type& storage, - BinaryFunction reduce_op) - { - this->reduce_impl( - ::rocprim::flat_block_thread_id(), - input, output, storage, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - BinaryFunction reduce_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->reduce(input, output, storage, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void reduce(T (&input)[ItemsPerThread], - T& output, - storage_type& storage, - BinaryFunction reduce_op) - { - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) { - thread_input = reduce_op(thread_input, input[i]); + this->reduce_impl(::rocprim::flat_block_thread_id(), input, output, storage, reduce_op); } - // Reduction of reduced values to get partials - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->reduce_impl( - flat_tid, - thread_input, output, // input, output - storage, - reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce(T (&input)[ItemsPerThread], - T& output, - BinaryFunction reduce_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->reduce(input, output, storage, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - unsigned int valid_items, - storage_type& storage, - BinaryFunction reduce_op) - { - this->reduce_impl( - ::rocprim::flat_block_thread_id(), - input, output, valid_items, storage, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - unsigned int valid_items, - BinaryFunction reduce_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->reduce(input, output, valid_items, storage, reduce_op); - } - -private: - template - ROCPRIM_DEVICE inline - void reduce_impl(const unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction reduce_op) - { - storage_type_& storage_ = storage.get(); - storage_.threads[flat_tid] = input; - ::rocprim::syncthreads(); + template + ROCPRIM_DEVICE inline void reduce(T input, T& output, BinaryFunction reduce_op) + { + ROCPRIM_SHARED_MEMORY storage_type storage; + this->reduce(input, output, storage, reduce_op); + } - if (flat_tid < warp_size_) + template + ROCPRIM_DEVICE inline void reduce(T (&input)[ItemsPerThread], + T& output, + storage_type& storage, + BinaryFunction reduce_op) { - T thread_reduction = storage_.threads[flat_tid]; - for(unsigned int i = warp_size_ + flat_tid; i < BlockSize; i += warp_size_) + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) { - thread_reduction = reduce_op( - thread_reduction, storage_.threads[i] - ); + thread_input = reduce_op(thread_input, input[i]); } - warp_reduce( - thread_reduction, output, BlockSize, reduce_op - ); + + // Reduction of reduced values to get partials + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->reduce_impl(flat_tid, + thread_input, + output, // input, output + storage, + reduce_op); } - } - - template - ROCPRIM_DEVICE inline - auto warp_reduce(T input, - T& output, - const unsigned int valid_items, - BinaryFunction reduce_op) - -> typename std::enable_if::type - { - WarpReduce().reduce( - input, output, valid_items, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - auto warp_reduce(T input, - T& output, - const unsigned int valid_items, - BinaryFunction reduce_op) - -> typename std::enable_if::type - { - (void) valid_items; - WarpReduce().reduce( - input, output, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce_impl(const unsigned int flat_tid, - T input, - T& output, - const unsigned int valid_items, - storage_type& storage, - BinaryFunction reduce_op) - { - storage_type_& storage_ = storage.get(); - storage_.threads[flat_tid] = input; - ::rocprim::syncthreads(); - if (flat_tid < warp_size_) + template + ROCPRIM_DEVICE inline void + reduce(T (&input)[ItemsPerThread], T& output, BinaryFunction reduce_op) { - T thread_reduction = storage_.threads[flat_tid]; - for(unsigned int i = warp_size_ + flat_tid; i < BlockSize; i += warp_size_) + ROCPRIM_SHARED_MEMORY storage_type storage; + this->reduce(input, output, storage, reduce_op); + } + + template + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op) + { + this->reduce_impl( + ::rocprim::flat_block_thread_id(), input, output, valid_items, storage, reduce_op); + } + + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op) + { + ROCPRIM_SHARED_MEMORY storage_type storage; + this->reduce(input, output, valid_items, storage, reduce_op); + } + + private: + template + ROCPRIM_DEVICE inline void reduce_impl(const unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction reduce_op) + { + storage_type_& storage_ = storage.get(); + storage_.threads[flat_tid] = input; + ::rocprim::syncthreads(); + + if(flat_tid < warp_size_) { - if(i < valid_items) + T thread_reduction = storage_.threads[flat_tid]; + for(unsigned int i = warp_size_ + flat_tid; i < BlockSize; i += warp_size_) { thread_reduction = reduce_op(thread_reduction, storage_.threads[i]); } + warp_reduce( + thread_reduction, output, BlockSize, reduce_op); } - warp_reduce_prefix_type().reduce(thread_reduction, output, valid_items, reduce_op); } - } -}; + + template + ROCPRIM_DEVICE inline auto warp_reduce(T input, + T& output, + const unsigned int valid_items, + BinaryFunction reduce_op) -> + typename std::enable_if::type + { + WarpReduce().reduce(input, output, valid_items, reduce_op); + } + + template + ROCPRIM_DEVICE inline auto warp_reduce(T input, + T& output, + const unsigned int valid_items, + BinaryFunction reduce_op) -> + typename std::enable_if::type + { + (void)valid_items; + WarpReduce().reduce(input, output, reduce_op); + } + + template + ROCPRIM_DEVICE inline void reduce_impl(const unsigned int flat_tid, + T input, + T& output, + const unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op) + { + storage_type_& storage_ = storage.get(); + storage_.threads[flat_tid] = input; + ::rocprim::syncthreads(); + + if(flat_tid < warp_size_) + { + T thread_reduction = storage_.threads[flat_tid]; + for(unsigned int i = warp_size_ + flat_tid; i < BlockSize; i += warp_size_) + { + if(i < valid_items) + { + thread_reduction = reduce_op(thread_reduction, storage_.threads[i]); + } + } + warp_reduce_prefix_type().reduce(thread_reduction, output, valid_items, reduce_op); + } + } + }; } // end namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp index 11afb7bf8..c956be768 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp @@ -26,8 +26,8 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../warp/warp_reduce.hpp" @@ -36,230 +36,193 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int BlockSize -> -class block_reduce_warp_reduce -{ - // Select warp size - static constexpr unsigned int warp_size_ = - detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); - // Number of warps in block - static constexpr unsigned int warps_no_ = (BlockSize + warp_size_ - 1) / warp_size_; - - // Check if we have to pass number of valid items into warp reduction primitive - static constexpr bool block_size_is_warp_multiple_ = ((BlockSize % warp_size_) == 0); - static constexpr bool warps_no_is_pow_of_two_ = detail::is_power_of_two(warps_no_); - - // typedef of warp_reduce primitive that will be used to perform warp-level - // reduce operation on input values. - // warp_reduce_crosslane is an implementation of warp_reduce that does not need storage, - // but requires logical warp size to be a power of two. - using warp_reduce_input_type = ::rocprim::detail::warp_reduce_crosslane; - // typedef of warp_reduce primitive that will be used to perform reduction - // of results of warp-level reduction. - using warp_reduce_output_type = ::rocprim::detail::warp_reduce_crosslane< - T, detail::next_power_of_two(warps_no_), false - >; - - struct storage_type_ + template + class block_reduce_warp_reduce { - T warp_partials[warps_no_]; - }; + // Select warp size + static constexpr unsigned int warp_size_ + = detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); + // Number of warps in block + static constexpr unsigned int warps_no_ = (BlockSize + warp_size_ - 1) / warp_size_; + + // Check if we have to pass number of valid items into warp reduction primitive + static constexpr bool block_size_is_warp_multiple_ = ((BlockSize % warp_size_) == 0); + static constexpr bool warps_no_is_pow_of_two_ = detail::is_power_of_two(warps_no_); + + // typedef of warp_reduce primitive that will be used to perform warp-level + // reduce operation on input values. + // warp_reduce_crosslane is an implementation of warp_reduce that does not need storage, + // but requires logical warp size to be a power of two. + using warp_reduce_input_type + = ::rocprim::detail::warp_reduce_crosslane; + // typedef of warp_reduce primitive that will be used to perform reduction + // of results of warp-level reduction. + using warp_reduce_output_type = ::rocprim::detail:: + warp_reduce_crosslane; + + struct storage_type_ + { + T warp_partials[warps_no_]; + }; -public: - using storage_type = detail::raw_storage; + public: + using storage_type = detail::raw_storage; - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - storage_type& storage, - BinaryFunction reduce_op) - { - this->reduce_impl( - ::rocprim::flat_block_thread_id(), - input, output, storage, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - BinaryFunction reduce_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->reduce(input, output, storage, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void reduce(T (&input)[ItemsPerThread], - T& output, - storage_type& storage, - BinaryFunction reduce_op) - { - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) { - thread_input = reduce_op(thread_input, input[i]); + this->reduce_impl(::rocprim::flat_block_thread_id(), input, output, storage, reduce_op); } - // Reduction of reduced values to get partials - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->reduce_impl( - flat_tid, - thread_input, output, // input, output - storage, - reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce(T (&input)[ItemsPerThread], - T& output, - BinaryFunction reduce_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->reduce(input, output, storage, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - unsigned int valid_items, - storage_type& storage, - BinaryFunction reduce_op) - { - this->reduce_impl( - ::rocprim::flat_block_thread_id(), - input, output, valid_items, storage, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - unsigned int valid_items, - BinaryFunction reduce_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->reduce(input, output, valid_items, storage, reduce_op); - } - -private: - template - ROCPRIM_DEVICE inline - void reduce_impl(const unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction reduce_op) - { - const auto warp_id = ::rocprim::warp_id(); - const auto lane_id = ::rocprim::lane_id(); - const unsigned int warp_offset = warp_id * warp_size_; - const unsigned int num_valid = - (warp_offset < BlockSize) ? BlockSize - warp_offset : 0; - storage_type_& storage_ = storage.get(); - - // Perform warp reduce - warp_reduce( - input, output, num_valid, reduce_op - ); - - // i-th warp will have its partial stored in storage_.warp_partials[i-1] - if(lane_id == 0) + template + ROCPRIM_DEVICE inline void reduce(T input, T& output, BinaryFunction reduce_op) { - storage_.warp_partials[warp_id] = output; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->reduce(input, output, storage, reduce_op); } - ::rocprim::syncthreads(); - if(flat_tid < warps_no_) + template + ROCPRIM_DEVICE inline void reduce(T (&input)[ItemsPerThread], + T& output, + storage_type& storage, + BinaryFunction reduce_op) { - // Use warp partial to calculate the final reduce results for every thread - auto warp_partial = storage_.warp_partials[lane_id]; + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + thread_input = reduce_op(thread_input, input[i]); + } + + // Reduction of reduced values to get partials + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->reduce_impl(flat_tid, + thread_input, + output, // input, output + storage, + reduce_op); + } - warp_reduce( - warp_partial, output, warps_no_, reduce_op - ); + template + ROCPRIM_DEVICE inline void + reduce(T (&input)[ItemsPerThread], T& output, BinaryFunction reduce_op) + { + ROCPRIM_SHARED_MEMORY storage_type storage; + this->reduce(input, output, storage, reduce_op); } - } - - template - ROCPRIM_DEVICE inline - auto warp_reduce(T input, - T& output, - const unsigned int valid_items, - BinaryFunction reduce_op) - -> typename std::enable_if::type - { - WarpReduce().reduce( - input, output, valid_items, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - auto warp_reduce(T input, - T& output, - const unsigned int valid_items, - BinaryFunction reduce_op) - -> typename std::enable_if::type - { - (void) valid_items; - WarpReduce().reduce( - input, output, reduce_op - ); - } - - template - ROCPRIM_DEVICE inline - void reduce_impl(const unsigned int flat_tid, - T input, - T& output, - const unsigned int valid_items, - storage_type& storage, - BinaryFunction reduce_op) - { - const auto warp_id = ::rocprim::warp_id(); - const auto lane_id = ::rocprim::lane_id(); - const unsigned int warp_offset = warp_id * warp_size_; - const unsigned int num_valid = - (warp_offset < valid_items) ? valid_items - warp_offset : 0; - storage_type_& storage_ = storage.get(); - - // Perform warp reduce - warp_reduce_input_type().reduce( - input, output, num_valid, reduce_op - ); - - // i-th warp will have its partial stored in storage_.warp_partials[i-1] - if(lane_id == 0) + + template + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op) + { + this->reduce_impl( + ::rocprim::flat_block_thread_id(), input, output, valid_items, storage, reduce_op); + } + + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op) { - storage_.warp_partials[warp_id] = output; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->reduce(input, output, valid_items, storage, reduce_op); } - ::rocprim::syncthreads(); - if(flat_tid < warps_no_) + private: + template + ROCPRIM_DEVICE inline void reduce_impl(const unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction reduce_op) { - // Use warp partial to calculate the final reduce results for every thread - auto warp_partial = storage_.warp_partials[lane_id]; + const auto warp_id = ::rocprim::warp_id(); + const auto lane_id = ::rocprim::lane_id(); + const unsigned int warp_offset = warp_id * warp_size_; + const unsigned int num_valid = (warp_offset < BlockSize) ? BlockSize - warp_offset : 0; + storage_type_& storage_ = storage.get(); + + // Perform warp reduce + warp_reduce( + input, output, num_valid, reduce_op); + + // i-th warp will have its partial stored in storage_.warp_partials[i-1] + if(lane_id == 0) + { + storage_.warp_partials[warp_id] = output; + } + ::rocprim::syncthreads(); + + if(flat_tid < warps_no_) + { + // Use warp partial to calculate the final reduce results for every thread + auto warp_partial = storage_.warp_partials[lane_id]; + + warp_reduce( + warp_partial, output, warps_no_, reduce_op); + } + } - unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_; - warp_reduce_output_type().reduce( - warp_partial, output, valid_warps_no, reduce_op - ); + template + ROCPRIM_DEVICE inline auto warp_reduce(T input, + T& output, + const unsigned int valid_items, + BinaryFunction reduce_op) -> + typename std::enable_if::type + { + WarpReduce().reduce(input, output, valid_items, reduce_op); } - } -}; + + template + ROCPRIM_DEVICE inline auto warp_reduce(T input, + T& output, + const unsigned int valid_items, + BinaryFunction reduce_op) -> + typename std::enable_if::type + { + (void)valid_items; + WarpReduce().reduce(input, output, reduce_op); + } + + template + ROCPRIM_DEVICE inline void reduce_impl(const unsigned int flat_tid, + T input, + T& output, + const unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op) + { + const auto warp_id = ::rocprim::warp_id(); + const auto lane_id = ::rocprim::lane_id(); + const unsigned int warp_offset = warp_id * warp_size_; + const unsigned int num_valid + = (warp_offset < valid_items) ? valid_items - warp_offset : 0; + storage_type_& storage_ = storage.get(); + + // Perform warp reduce + warp_reduce_input_type().reduce(input, output, num_valid, reduce_op); + + // i-th warp will have its partial stored in storage_.warp_partials[i-1] + if(lane_id == 0) + { + storage_.warp_partials[warp_id] = output; + } + ::rocprim::syncthreads(); + + if(flat_tid < warps_no_) + { + // Use warp partial to calculate the final reduce results for every thread + auto warp_partial = storage_.warp_partials[lane_id]; + + unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_; + warp_reduce_output_type().reduce(warp_partial, output, valid_warps_no, reduce_op); + } + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp b/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp index 36868d34b..8d0a8163b 100644 --- a/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp +++ b/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp @@ -26,8 +26,8 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../warp/warp_scan.hpp" @@ -36,590 +36,522 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int BlockSize -> -class block_scan_reduce_then_scan -{ - // Number of items to reduce per thread - static constexpr unsigned int thread_reduction_size_ = - (BlockSize + ::rocprim::warp_size() - 1)/ ::rocprim::warp_size(); - - // Warp scan, warp_scan_crosslane does not require shared memory (storage), but - // logical warp size must be a power of two. - static constexpr unsigned int warp_size_ = - detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); - using warp_scan_prefix_type = ::rocprim::detail::warp_scan_crosslane; - - // Minimize LDS bank conflicts - static constexpr unsigned int banks_no_ = ::rocprim::detail::get_lds_banks_no(); - static constexpr bool has_bank_conflicts_ = - ::rocprim::detail::is_power_of_two(thread_reduction_size_) && thread_reduction_size_ > 1; - static constexpr unsigned int bank_conflicts_padding = - has_bank_conflicts_ ? (warp_size_ * thread_reduction_size_ / banks_no_) : 0; - - struct storage_type_ + template + class block_scan_reduce_then_scan { - T threads[warp_size_ * thread_reduction_size_ + bank_conflicts_padding]; - }; + // Number of items to reduce per thread + static constexpr unsigned int thread_reduction_size_ + = (BlockSize + ::rocprim::warp_size() - 1) / ::rocprim::warp_size(); + + // Warp scan, warp_scan_crosslane does not require shared memory (storage), but + // logical warp size must be a power of two. + static constexpr unsigned int warp_size_ + = detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); + using warp_scan_prefix_type = ::rocprim::detail::warp_scan_crosslane; + + // Minimize LDS bank conflicts + static constexpr unsigned int banks_no_ = ::rocprim::detail::get_lds_banks_no(); + static constexpr bool has_bank_conflicts_ + = ::rocprim::detail::is_power_of_two(thread_reduction_size_) + && thread_reduction_size_ > 1; + static constexpr unsigned int bank_conflicts_padding + = has_bank_conflicts_ ? (warp_size_ * thread_reduction_size_ / banks_no_) : 0; + + struct storage_type_ + { + T threads[warp_size_ * thread_reduction_size_ + bank_conflicts_padding]; + }; -public: - using storage_type = detail::raw_storage; + public: + using storage_type = detail::raw_storage; - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - { - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - this->inclusive_scan(input, output, storage, scan_op); - reduction = storage_.threads[index(BlockSize - 1)]; - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, reduction, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - const auto flat_tid = ::rocprim::flat_block_thread_id(); - const auto warp_id = ::rocprim::warp_id(); - storage_type_& storage_ = storage.get(); - this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op); - // Include block prefix (this operation overwrites storage_.threads[0]) - T block_prefix = this->get_block_prefix( - flat_tid, warp_id, - storage_.threads[index(BlockSize - 1)], // block reduction - prefix_callback_op, storage - ); - output = scan_op(block_prefix, output); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - BinaryFunction scan_op) - { - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) { - thread_input = scan_op(thread_input, input[i]); + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op); } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - storage, - scan_op - ); - - // Include prefix (first thread does not have prefix) - output[0] = input[0]; - if(flat_tid != 0) output[0] = scan_op(thread_input, input[0]); - // Final thread-local scan - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void inclusive_scan(T input, T& output, BinaryFunction scan_op) { - output[i] = scan_op(output[i-1], input[i]); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, storage, scan_op); } - } - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - this->inclusive_scan(input, output, storage, scan_op); - // Save reduction result - reduction = storage_.threads[index(BlockSize - 1)]; - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, reduction, storage, scan_op); - } - - template< - class PrefixCallback, - unsigned int ItemsPerThread, - class BinaryFunction - > - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void inclusive_scan( + T input, T& output, T& reduction, storage_type& storage, BinaryFunction scan_op) { - thread_input = scan_op(thread_input, input[i]); + storage_type_& storage_ = storage.get(); + this->inclusive_scan(input, output, storage, scan_op); + reduction = storage_.threads[index(BlockSize - 1)]; } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - storage, - scan_op - ); - - // this operation overwrites storage_.threads[0] - T block_prefix = this->get_block_prefix( - flat_tid, ::rocprim::warp_id(), - storage_.threads[index(BlockSize - 1)], // block reduction - prefix_callback_op, storage - ); - - // Include prefix (first thread does not have prefix) - output[0] = input[0]; - if(flat_tid != 0) output[0] = scan_op(thread_input, input[0]); - // Include block prefix - output[0] = scan_op(block_prefix, output[0]); - // Final thread-local scan - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, T& reduction, BinaryFunction scan_op) { - output[i] = scan_op(output[i-1], input[i]); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, reduction, storage, scan_op); } - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - storage_type& storage, - BinaryFunction scan_op) - { - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl(flat_tid, input, output, init, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan(input, output, init, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - const auto flat_tid = ::rocprim::flat_block_thread_id(); - storage_type_& storage_ = storage.get(); - this->exclusive_scan_impl( - flat_tid, input, output, init, storage, scan_op - ); - // Save reduction result - reduction = storage_.threads[index(BlockSize - 1)]; - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan(input, output, init, reduction, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - const auto flat_tid = ::rocprim::flat_block_thread_id(); - const auto warp_id = ::rocprim::warp_id(); - storage_type_& storage_ = storage.get(); - this->exclusive_scan_impl( - flat_tid, input, output, storage, scan_op - ); - // Get reduction result - T reduction = storage_.threads[index(BlockSize - 1)]; - // Include block prefix (this operation overwrites storage_.threads[0]) - T block_prefix = this->get_block_prefix( - flat_tid, warp_id, reduction, - prefix_callback_op, storage - ); - output = scan_op(block_prefix, output); - if(flat_tid == 0) output = block_prefix; - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - storage_type& storage, - BinaryFunction scan_op) - { - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void inclusive_scan(T input, + T& output, + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) + { + const auto flat_tid = ::rocprim::flat_block_thread_id(); + const auto warp_id = ::rocprim::warp_id(); + storage_type_& storage_ = storage.get(); + this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op); + // Include block prefix (this operation overwrites storage_.threads[0]) + T block_prefix + = this->get_block_prefix(flat_tid, + warp_id, + storage_.threads[index(BlockSize - 1)], // block reduction + prefix_callback_op, + storage); + output = scan_op(block_prefix, output); + } + + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + BinaryFunction scan_op) + { + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + thread_input = scan_op(thread_input, input[i]); + } + + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + storage, + scan_op); + + // Include prefix (first thread does not have prefix) + output[0] = input[0]; + if(flat_tid != 0) + output[0] = scan_op(thread_input, input[0]); +// Final thread-local scan +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + output[i] = scan_op(output[i - 1], input[i]); + } + } + + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + BinaryFunction scan_op) { - thread_input = scan_op(thread_input, input[i]); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, storage, scan_op); } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - init, - storage, - scan_op - ); - - // Include init value - T prev = input[0]; - T exclusive = init; - if(flat_tid != 0) + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& reduction, + storage_type& storage, + BinaryFunction scan_op) { - exclusive = thread_input; + storage_type_& storage_ = storage.get(); + this->inclusive_scan(input, output, storage, scan_op); + // Save reduction result + reduction = storage_.threads[index(BlockSize - 1)]; } - output[0] = exclusive; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& reduction, + BinaryFunction scan_op) { - exclusive = scan_op(exclusive, prev); - prev = input[i]; - output[i] = exclusive; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, reduction, storage, scan_op); } - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan(input, output, init, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - this->exclusive_scan(input, output, init, storage, scan_op); - // Save reduction result - reduction = storage_.threads[index(BlockSize - 1)]; - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan(input, output, init, reduction, storage, scan_op); - } - - template< - class PrefixCallback, - unsigned int ItemsPerThread, - class BinaryFunction - > - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { - thread_input = scan_op(thread_input, input[i]); + storage_type_& storage_ = storage.get(); + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + thread_input = scan_op(thread_input, input[i]); + } + + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + storage, + scan_op); + + // this operation overwrites storage_.threads[0] + T block_prefix + = this->get_block_prefix(flat_tid, + ::rocprim::warp_id(), + storage_.threads[index(BlockSize - 1)], // block reduction + prefix_callback_op, + storage); + + // Include prefix (first thread does not have prefix) + output[0] = input[0]; + if(flat_tid != 0) + output[0] = scan_op(thread_input, input[0]); + // Include block prefix + output[0] = scan_op(block_prefix, output[0]); +// Final thread-local scan +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + output[i] = scan_op(output[i - 1], input[i]); + } } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - storage, - scan_op - ); - - // this operation overwrites storage_.warp_prefixes[0] - T block_prefix = this->get_block_prefix( - flat_tid, ::rocprim::warp_id(), - storage_.threads[index(BlockSize - 1)], // block reduction - prefix_callback_op, storage - ); - - // Include init value and block prefix - T prev = input[0]; - T exclusive = block_prefix; - if(flat_tid != 0) + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, storage_type& storage, BinaryFunction scan_op) { - exclusive = scan_op(block_prefix, thread_input); + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, input, output, init, storage, scan_op); } - output[0] = exclusive; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, BinaryFunction scan_op) { - exclusive = scan_op(exclusive, prev); - prev = input[i]; - output[i] = exclusive; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, storage, scan_op); } - } - -private: - - // Calculates inclusive scan results and stores them in storage_.threads, - // result for each thread is stored in storage_.threads[flat_tid], and sets - // output to storage_.threads[flat_tid] - template - ROCPRIM_DEVICE inline - void inclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Calculate inclusive scan, - // result for each thread is stored in storage_.threads[flat_tid] - this->inclusive_scan_base(flat_tid, input, storage, scan_op); - output = storage_.threads[index(flat_tid)]; - } - - // Calculates inclusive scan results and stores them in storage_.threads, - // result for each thread is stored in storage_.threads[flat_tid] - template - ROCPRIM_DEVICE inline - void inclusive_scan_base(const unsigned int flat_tid, - T input, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - storage_.threads[index(flat_tid)] = input; - ::rocprim::syncthreads(); - if(flat_tid < warp_size_) + + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, T& reduction, storage_type& storage, BinaryFunction scan_op) + { + const auto flat_tid = ::rocprim::flat_block_thread_id(); + storage_type_& storage_ = storage.get(); + this->exclusive_scan_impl(flat_tid, input, output, init, storage, scan_op); + // Save reduction result + reduction = storage_.threads[index(BlockSize - 1)]; + } + + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, T& reduction, BinaryFunction scan_op) { - const unsigned int idx_start = index(flat_tid * thread_reduction_size_); - const unsigned int idx_end = idx_start + thread_reduction_size_; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, reduction, storage, scan_op); + } - T thread_reduction = storage_.threads[idx_start]; - #pragma unroll - for(unsigned int i = idx_start + 1; i < idx_end; i++) + template + ROCPRIM_DEVICE inline void exclusive_scan(T input, + T& output, + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) + { + const auto flat_tid = ::rocprim::flat_block_thread_id(); + const auto warp_id = ::rocprim::warp_id(); + storage_type_& storage_ = storage.get(); + this->exclusive_scan_impl(flat_tid, input, output, storage, scan_op); + // Get reduction result + T reduction = storage_.threads[index(BlockSize - 1)]; + // Include block prefix (this operation overwrites storage_.threads[0]) + T block_prefix + = this->get_block_prefix(flat_tid, warp_id, reduction, prefix_callback_op, storage); + output = scan_op(block_prefix, output); + if(flat_tid == 0) + output = block_prefix; + } + + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + storage_type& storage, + BinaryFunction scan_op) + { + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) { - thread_reduction = scan_op( - thread_reduction, storage_.threads[i] - ); + thread_input = scan_op(thread_input, input[i]); } - // Calculate warp prefixes - warp_scan_prefix_type().inclusive_scan(thread_reduction, thread_reduction, scan_op); - thread_reduction = warp_shuffle_up(thread_reduction, 1, warp_size_); + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + init, + storage, + scan_op); + + // Include init value + T prev = input[0]; + T exclusive = init; + if(flat_tid != 0) + { + exclusive = thread_input; + } + output[0] = exclusive; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + exclusive = scan_op(exclusive, prev); + prev = input[i]; + output[i] = exclusive; + } + } - // Include warp prefix - thread_reduction = scan_op(thread_reduction, storage_.threads[idx_start]); - if(flat_tid == 0) + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + BinaryFunction scan_op) + { + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, storage, scan_op); + } + + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op) + { + storage_type_& storage_ = storage.get(); + this->exclusive_scan(input, output, init, storage, scan_op); + // Save reduction result + reduction = storage_.threads[index(BlockSize - 1)]; + } + + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + T& reduction, + BinaryFunction scan_op) + { + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, reduction, storage, scan_op); + } + + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) + { + storage_type_& storage_ = storage.get(); + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) { - thread_reduction = input; + thread_input = scan_op(thread_input, input[i]); } - storage_.threads[idx_start] = thread_reduction; - #pragma unroll - for(unsigned int i = idx_start + 1; i < idx_end; i++) + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + storage, + scan_op); + + // this operation overwrites storage_.warp_prefixes[0] + T block_prefix + = this->get_block_prefix(flat_tid, + ::rocprim::warp_id(), + storage_.threads[index(BlockSize - 1)], // block reduction + prefix_callback_op, + storage); + + // Include init value and block prefix + T prev = input[0]; + T exclusive = block_prefix; + if(flat_tid != 0) + { + exclusive = scan_op(block_prefix, thread_input); + } + output[0] = exclusive; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) { - thread_reduction = scan_op( - thread_reduction, storage_.threads[i] - ); - storage_.threads[i] = thread_reduction; + exclusive = scan_op(exclusive, prev); + prev = input[i]; + output[i] = exclusive; } } - ::rocprim::syncthreads(); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - T init, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Calculates inclusive scan, result for each thread is stored in storage_.threads[flat_tid] - this->inclusive_scan_base(flat_tid, input, storage, scan_op); - output = init; - if(flat_tid != 0) output = scan_op(init, storage_.threads[index(flat_tid-1)]); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Calculates inclusive scan, result for each thread is stored in storage_.threads[flat_tid] - this->inclusive_scan_base(flat_tid, input, storage, scan_op); - if(flat_tid > 0) + + private: + // Calculates inclusive scan results and stores them in storage_.threads, + // result for each thread is stored in storage_.threads[flat_tid], and sets + // output to storage_.threads[flat_tid] + template + ROCPRIM_DEVICE inline void inclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction scan_op) { - output = storage_.threads[index(flat_tid-1)]; + storage_type_& storage_ = storage.get(); + // Calculate inclusive scan, + // result for each thread is stored in storage_.threads[flat_tid] + this->inclusive_scan_base(flat_tid, input, storage, scan_op); + output = storage_.threads[index(flat_tid)]; } - } - - // OVERWRITES storage_.threads[0] - template - ROCPRIM_DEVICE inline - void include_block_prefix(const unsigned int flat_tid, - const unsigned int warp_id, - const T input, - T& output, - const T reduction, - PrefixCallback& prefix_callback_op, - storage_type& storage, - BinaryFunction scan_op) - { - T block_prefix = this->get_block_prefix( - flat_tid, warp_id, reduction, - prefix_callback_op, storage - ); - output = scan_op(block_prefix, input); - } - - // OVERWRITES storage_.threads[0] - template - ROCPRIM_DEVICE inline - T get_block_prefix(const unsigned int flat_tid, - const unsigned int warp_id, - const T reduction, - PrefixCallback& prefix_callback_op, - storage_type& storage) - { - storage_type_& storage_ = storage.get(); - if(warp_id == 0) + + // Calculates inclusive scan results and stores them in storage_.threads, + // result for each thread is stored in storage_.threads[flat_tid] + template + ROCPRIM_DEVICE inline void inclusive_scan_base(const unsigned int flat_tid, + T input, + storage_type& storage, + BinaryFunction scan_op) { - T block_prefix = prefix_callback_op(reduction); - if(flat_tid == 0) + storage_type_& storage_ = storage.get(); + storage_.threads[index(flat_tid)] = input; + ::rocprim::syncthreads(); + if(flat_tid < warp_size_) { - // Reuse storage_.threads[0] which should not be - // needed at that point. - storage_.threads[0] = block_prefix; + const unsigned int idx_start = index(flat_tid * thread_reduction_size_); + const unsigned int idx_end = idx_start + thread_reduction_size_; + + T thread_reduction = storage_.threads[idx_start]; +#pragma unroll + for(unsigned int i = idx_start + 1; i < idx_end; i++) + { + thread_reduction = scan_op(thread_reduction, storage_.threads[i]); + } + + // Calculate warp prefixes + warp_scan_prefix_type().inclusive_scan(thread_reduction, thread_reduction, scan_op); + thread_reduction = warp_shuffle_up(thread_reduction, 1, warp_size_); + + // Include warp prefix + thread_reduction = scan_op(thread_reduction, storage_.threads[idx_start]); + if(flat_tid == 0) + { + thread_reduction = input; + } + + storage_.threads[idx_start] = thread_reduction; +#pragma unroll + for(unsigned int i = idx_start + 1; i < idx_end; i++) + { + thread_reduction = scan_op(thread_reduction, storage_.threads[i]); + storage_.threads[i] = thread_reduction; + } } + ::rocprim::syncthreads(); } - ::rocprim::syncthreads(); - return storage_.threads[0]; - } - // Change index to minimize LDS bank conflicts if necessary - ROCPRIM_DEVICE inline - unsigned int index(unsigned int n) const - { - // Move every 32-bank wide "row" (32 banks * 4 bytes) by one item - return has_bank_conflicts_ ? (n + (n/banks_no_)) : n; - } -}; + template + ROCPRIM_DEVICE inline void exclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + T init, + storage_type& storage, + BinaryFunction scan_op) + { + storage_type_& storage_ = storage.get(); + // Calculates inclusive scan, result for each thread is stored in storage_.threads[flat_tid] + this->inclusive_scan_base(flat_tid, input, storage, scan_op); + output = init; + if(flat_tid != 0) + output = scan_op(init, storage_.threads[index(flat_tid - 1)]); + } + + template + ROCPRIM_DEVICE inline void exclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction scan_op) + { + storage_type_& storage_ = storage.get(); + // Calculates inclusive scan, result for each thread is stored in storage_.threads[flat_tid] + this->inclusive_scan_base(flat_tid, input, storage, scan_op); + if(flat_tid > 0) + { + output = storage_.threads[index(flat_tid - 1)]; + } + } + + // OVERWRITES storage_.threads[0] + template + ROCPRIM_DEVICE inline void include_block_prefix(const unsigned int flat_tid, + const unsigned int warp_id, + const T input, + T& output, + const T reduction, + PrefixCallback& prefix_callback_op, + storage_type& storage, + BinaryFunction scan_op) + { + T block_prefix + = this->get_block_prefix(flat_tid, warp_id, reduction, prefix_callback_op, storage); + output = scan_op(block_prefix, input); + } + + // OVERWRITES storage_.threads[0] + template + ROCPRIM_DEVICE inline T get_block_prefix(const unsigned int flat_tid, + const unsigned int warp_id, + const T reduction, + PrefixCallback& prefix_callback_op, + storage_type& storage) + { + storage_type_& storage_ = storage.get(); + if(warp_id == 0) + { + T block_prefix = prefix_callback_op(reduction); + if(flat_tid == 0) + { + // Reuse storage_.threads[0] which should not be + // needed at that point. + storage_.threads[0] = block_prefix; + } + } + ::rocprim::syncthreads(); + return storage_.threads[0]; + } + + // Change index to minimize LDS bank conflicts if necessary + ROCPRIM_DEVICE inline unsigned int index(unsigned int n) const + { + // Move every 32-bank wide "row" (32 banks * 4 bytes) by one item + return has_bank_conflicts_ ? (n + (n / banks_no_)) : n; + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp b/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp index fe227160d..3b8ced074 100644 --- a/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp +++ b/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp @@ -26,8 +26,8 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../warp/warp_scan.hpp" @@ -36,708 +36,645 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int BlockSize -> -class block_scan_warp_scan -{ - // Select warp size - static constexpr unsigned int warp_size_ = - detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); - // Number of warps in block - static constexpr unsigned int warps_no_ = (BlockSize + warp_size_ - 1) / warp_size_; - - // typedef of warp_scan primitive that will be used to perform warp-level - // inclusive/exclusive scan operations on input values. - // warp_scan_crosslane is an implementation of warp_scan that does not need storage, - // but requires logical warp size to be a power of two. - using warp_scan_input_type = ::rocprim::detail::warp_scan_crosslane; - // typedef of warp_scan primitive that will be used to get prefix values for - // each warp (scanned carry-outs from warps before it). - using warp_scan_prefix_type = ::rocprim::detail::warp_scan_crosslane; - - struct storage_type_ - { - T warp_prefixes[warps_no_]; - // ---------- Shared memory optimisation ---------- - // Since warp_scan_input and warp_scan_prefix are typedef of warp_scan_crosslane, - // we don't need to allocate any temporary memory for them. - // If we just use warp_scan, we would need to add following union to this struct: - // union - // { - // typename warp_scan_input::storage_type wscan[warps_no_]; - // typename warp_scan_prefix::storage_type wprefix_scan; - // }; - // and use storage_.wscan[warp_id] and storage.wprefix_scan when calling - // warp_scan_input().inclusive_scan(..) and warp_scan_prefix().inclusive_scan(..). - }; - -public: - using storage_type = detail::raw_storage; - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - { - this->inclusive_scan_impl( - ::rocprim::flat_block_thread_id(), - input, output, storage, scan_op - ); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - BinaryFunction scan_op) + template + class block_scan_warp_scan { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - this->inclusive_scan(input, output, storage, scan_op); - // Save reduction result - reduction = storage_.warp_prefixes[warps_no_ - 1]; - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, reduction, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - const auto flat_tid = ::rocprim::flat_block_thread_id(); - const auto warp_id = ::rocprim::warp_id(); - storage_type_& storage_ = storage.get(); - this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op); - // Include block prefix (this operation overwrites storage_.warp_prefixes[warps_no_ - 1]) - T block_prefix = this->get_block_prefix( - flat_tid, warp_id, - storage_.warp_prefixes[warps_no_ - 1], // block reduction - prefix_callback_op, storage - ); - output = scan_op(block_prefix, output); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - BinaryFunction scan_op) - { - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + // Select warp size + static constexpr unsigned int warp_size_ + = detail::get_min_warp_size(BlockSize, ::rocprim::warp_size()); + // Number of warps in block + static constexpr unsigned int warps_no_ = (BlockSize + warp_size_ - 1) / warp_size_; + + // typedef of warp_scan primitive that will be used to perform warp-level + // inclusive/exclusive scan operations on input values. + // warp_scan_crosslane is an implementation of warp_scan that does not need storage, + // but requires logical warp size to be a power of two. + using warp_scan_input_type = ::rocprim::detail::warp_scan_crosslane; + // typedef of warp_scan primitive that will be used to get prefix values for + // each warp (scanned carry-outs from warps before it). + using warp_scan_prefix_type + = ::rocprim::detail::warp_scan_crosslane; + + struct storage_type_ { - thread_input = scan_op(thread_input, input[i]); + T warp_prefixes[warps_no_]; + // ---------- Shared memory optimisation ---------- + // Since warp_scan_input and warp_scan_prefix are typedef of warp_scan_crosslane, + // we don't need to allocate any temporary memory for them. + // If we just use warp_scan, we would need to add following union to this struct: + // union + // { + // typename warp_scan_input::storage_type wscan[warps_no_]; + // typename warp_scan_prefix::storage_type wprefix_scan; + // }; + // and use storage_.wscan[warp_id] and storage.wprefix_scan when calling + // warp_scan_input().inclusive_scan(..) and warp_scan_prefix().inclusive_scan(..). + }; + + public: + using storage_type = detail::raw_storage; + + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) + { + this->inclusive_scan_impl( + ::rocprim::flat_block_thread_id(), input, output, storage, scan_op); } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - storage, - scan_op - ); - - // Include prefix (first thread does not have prefix) - output[0] = input[0]; - if(flat_tid != 0) + template + ROCPRIM_DEVICE inline void inclusive_scan(T input, T& output, BinaryFunction scan_op) { - output[0] = scan_op(thread_input, input[0]); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, storage, scan_op); } - // Final thread-local scan - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void inclusive_scan( + T input, T& output, T& reduction, storage_type& storage, BinaryFunction scan_op) { - output[i] = scan_op(output[i-1], input[i]); + storage_type_& storage_ = storage.get(); + this->inclusive_scan(input, output, storage, scan_op); + // Save reduction result + reduction = storage_.warp_prefixes[warps_no_ - 1]; } - } - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - this->inclusive_scan(input, output, storage, scan_op); - // Save reduction result - reduction = storage_.warp_prefixes[warps_no_ - 1]; - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->inclusive_scan(input, output, reduction, storage, scan_op); - } - - template< - class PrefixCallback, - unsigned int ItemsPerThread, - class BinaryFunction - > - ROCPRIM_DEVICE inline - void inclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, T& reduction, BinaryFunction scan_op) { - thread_input = scan_op(thread_input, input[i]); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, reduction, storage, scan_op); } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - storage, - scan_op - ); - - // this operation overwrites storage_.warp_prefixes[warps_no_ - 1] - T block_prefix = this->get_block_prefix( - flat_tid, ::rocprim::warp_id(), - storage_.warp_prefixes[warps_no_ - 1], // block reduction - prefix_callback_op, storage - ); - - // Include prefix (first thread does not have prefix) - output[0] = input[0]; - if(flat_tid != 0) + template + ROCPRIM_DEVICE inline void inclusive_scan(T input, + T& output, + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { - output[0] = scan_op(thread_input, input[0]); + const auto flat_tid = ::rocprim::flat_block_thread_id(); + const auto warp_id = ::rocprim::warp_id(); + storage_type_& storage_ = storage.get(); + this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op); + // Include block prefix (this operation overwrites storage_.warp_prefixes[warps_no_ - 1]) + T block_prefix + = this->get_block_prefix(flat_tid, + warp_id, + storage_.warp_prefixes[warps_no_ - 1], // block reduction + prefix_callback_op, + storage); + output = scan_op(block_prefix, output); } - // Include block prefix - output[0] = scan_op(block_prefix, output[0]); - // Final thread-local scan - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + BinaryFunction scan_op) { - output[i] = scan_op(output[i-1], input[i]); + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + thread_input = scan_op(thread_input, input[i]); + } + + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + storage, + scan_op); + + // Include prefix (first thread does not have prefix) + output[0] = input[0]; + if(flat_tid != 0) + { + output[0] = scan_op(thread_input, input[0]); + } +// Final thread-local scan +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + output[i] = scan_op(output[i - 1], input[i]); + } } - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - storage_type& storage, - BinaryFunction scan_op) - { - this->exclusive_scan_impl( - ::rocprim::flat_block_thread_id(), - input, output, init, storage, scan_op - ); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan( - input, output, init, storage, scan_op - ); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - this->exclusive_scan( - input, output, init, storage, scan_op - ); - // Save reduction result - reduction = storage_.warp_prefixes[warps_no_ - 1]; - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan( - input, output, init, reduction, storage, scan_op - ); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - const auto flat_tid = ::rocprim::flat_block_thread_id(); - const auto warp_id = ::rocprim::warp_id(); - storage_type_& storage_ = storage.get(); - this->exclusive_scan_impl( - flat_tid, input, output, storage, scan_op - ); - // Include block prefix (this operation overwrites storage_.warp_prefixes[warps_no_ - 1]) - T block_prefix = this->get_block_prefix( - flat_tid, warp_id, - storage_.warp_prefixes[warps_no_ - 1], // block reduction - prefix_callback_op, storage - ); - output = scan_op(block_prefix, output); - if(flat_tid == 0) output = block_prefix; - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - storage_type& storage, - BinaryFunction scan_op) - { - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + BinaryFunction scan_op) { - thread_input = scan_op(thread_input, input[i]); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, storage, scan_op); } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - init, - storage, - scan_op - ); - - // Include init value - T prev = input[0]; - T exclusive = init; - if(flat_tid != 0) + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& reduction, + storage_type& storage, + BinaryFunction scan_op) { - exclusive = thread_input; + storage_type_& storage_ = storage.get(); + this->inclusive_scan(input, output, storage, scan_op); + // Save reduction result + reduction = storage_.warp_prefixes[warps_no_ - 1]; } - output[0] = exclusive; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& reduction, + BinaryFunction scan_op) { - exclusive = scan_op(exclusive, prev); - prev = input[i]; - output[i] = exclusive; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->inclusive_scan(input, output, reduction, storage, scan_op); } - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan(input, output, init, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - this->exclusive_scan(input, output, init, storage, scan_op); - // Save reduction result - reduction = storage_.warp_prefixes[warps_no_ - 1]; - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T init, - T& reduction, - BinaryFunction scan_op) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->exclusive_scan(input, output, init, reduction, storage, scan_op); - } - - template< - class PrefixCallback, - unsigned int ItemsPerThread, - class BinaryFunction - > - ROCPRIM_DEVICE inline - void exclusive_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Reduce thread items - T thread_input = input[0]; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void inclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { - thread_input = scan_op(thread_input, input[i]); + storage_type_& storage_ = storage.get(); + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + thread_input = scan_op(thread_input, input[i]); + } + + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + storage, + scan_op); + + // this operation overwrites storage_.warp_prefixes[warps_no_ - 1] + T block_prefix + = this->get_block_prefix(flat_tid, + ::rocprim::warp_id(), + storage_.warp_prefixes[warps_no_ - 1], // block reduction + prefix_callback_op, + storage); + + // Include prefix (first thread does not have prefix) + output[0] = input[0]; + if(flat_tid != 0) + { + output[0] = scan_op(thread_input, input[0]); + } + // Include block prefix + output[0] = scan_op(block_prefix, output[0]); +// Final thread-local scan +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + output[i] = scan_op(output[i - 1], input[i]); + } } - // Scan of reduced values to get prefixes - const auto flat_tid = ::rocprim::flat_block_thread_id(); - this->exclusive_scan_impl( - flat_tid, - thread_input, thread_input, // input, output - storage, - scan_op - ); - - // this operation overwrites storage_.warp_prefixes[warps_no_ - 1] - T block_prefix = this->get_block_prefix( - flat_tid, ::rocprim::warp_id(), - storage_.warp_prefixes[warps_no_ - 1], // block reduction - prefix_callback_op, storage - ); - - // Include init value and block prefix - T prev = input[0]; - T exclusive = block_prefix; - if(flat_tid != 0) + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, storage_type& storage, BinaryFunction scan_op) { - exclusive = scan_op(block_prefix, thread_input); + this->exclusive_scan_impl( + ::rocprim::flat_block_thread_id(), input, output, init, storage, scan_op); } - output[0] = exclusive; - #pragma unroll - for(unsigned int i = 1; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, BinaryFunction scan_op) { - exclusive = scan_op(exclusive, prev); - prev = input[i]; - output[i] = exclusive; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, storage, scan_op); } - } - -private: - template - ROCPRIM_DEVICE inline - auto inclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if<(BlockSize_ > ::rocprim::warp_size())>::type - { - storage_type_& storage_ = storage.get(); - // Perform warp scan - warp_scan_input_type().inclusive_scan( - // not using shared mem, see note in storage_type - input, output, scan_op - ); - // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] - const auto warp_id = ::rocprim::warp_id(); - this->calculate_warp_prefixes(flat_tid, warp_id, output, storage, scan_op); + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, T& reduction, storage_type& storage, BinaryFunction scan_op) + { + storage_type_& storage_ = storage.get(); + this->exclusive_scan(input, output, init, storage, scan_op); + // Save reduction result + reduction = storage_.warp_prefixes[warps_no_ - 1]; + } - // Use warp prefix to calculate the final scan results for every thread - if(warp_id != 0) + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, T& reduction, BinaryFunction scan_op) { - auto warp_prefix = storage_.warp_prefixes[warp_id - 1]; - output = scan_op(warp_prefix, output); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, reduction, storage, scan_op); } - } - - // When BlockSize is less than warp_size we dont need the extra prefix calculations. - template - ROCPRIM_DEVICE inline - auto inclusive_scan_impl(unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if ::rocprim::warp_size())>::type - { - (void) storage; - (void) flat_tid; - storage_type_& storage_ = storage.get(); - // Perform warp scan - warp_scan_input_type().inclusive_scan( - // not using shared mem, see note in storage_type - input, output, scan_op - ); - - if(flat_tid == BlockSize_ - 1) + + template + ROCPRIM_DEVICE inline void exclusive_scan(T input, + T& output, + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { - storage_.warp_prefixes[0] = output; + const auto flat_tid = ::rocprim::flat_block_thread_id(); + const auto warp_id = ::rocprim::warp_id(); + storage_type_& storage_ = storage.get(); + this->exclusive_scan_impl(flat_tid, input, output, storage, scan_op); + // Include block prefix (this operation overwrites storage_.warp_prefixes[warps_no_ - 1]) + T block_prefix + = this->get_block_prefix(flat_tid, + warp_id, + storage_.warp_prefixes[warps_no_ - 1], // block reduction + prefix_callback_op, + storage); + output = scan_op(block_prefix, output); + if(flat_tid == 0) + output = block_prefix; } - ::rocprim::syncthreads(); - } - - // Exclusive scan with initial value when BlockSize is bigger than warp_size - template - ROCPRIM_DEVICE inline - auto exclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - T init, - storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if<(BlockSize_ > ::rocprim::warp_size())>::type - { - storage_type_& storage_ = storage.get(); - // Perform warp scan on input values - warp_scan_input_type().inclusive_scan( - // not using shared mem, see note in storage_type - input, output, scan_op - ); - // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] - const auto warp_id = ::rocprim::warp_id(); - this->calculate_warp_prefixes(flat_tid, warp_id, output, storage, scan_op); + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + storage_type& storage, + BinaryFunction scan_op) + { + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + thread_input = scan_op(thread_input, input[i]); + } + + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + init, + storage, + scan_op); + + // Include init value + T prev = input[0]; + T exclusive = init; + if(flat_tid != 0) + { + exclusive = thread_input; + } + output[0] = exclusive; + +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + exclusive = scan_op(exclusive, prev); + prev = input[i]; + output[i] = exclusive; + } + } - // Include initial value in warp prefixes, and fix warp prefixes - // for exclusive scan (first warp prefix is init) - auto warp_prefix = init; - if(warp_id != 0) + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + BinaryFunction scan_op) { - warp_prefix = scan_op(init, storage_.warp_prefixes[warp_id-1]); + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, storage, scan_op); } - // Use warp prefix to calculate the final scan results for every thread - output = scan_op(warp_prefix, output); // include warp prefix in scan results - output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results - if(::rocprim::lane_id() == 0) + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op) { - output = warp_prefix; + storage_type_& storage_ = storage.get(); + this->exclusive_scan(input, output, init, storage, scan_op); + // Save reduction result + reduction = storage_.warp_prefixes[warps_no_ - 1]; } - } - - // Exclusive scan with initial value when BlockSize is less than warp_size. - // When BlockSize is less than warp_size we dont need the extra prefix calculations. - template - ROCPRIM_DEVICE inline - auto exclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - T init, - storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if ::rocprim::warp_size())>::type - { - (void) flat_tid; - (void) storage; - (void) init; - storage_type_& storage_ = storage.get(); - // Perform warp scan on input values - warp_scan_input_type().inclusive_scan( - // not using shared mem, see note in storage_type - input, output, scan_op - ); - - if(flat_tid == BlockSize_ - 1) + + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T init, + T& reduction, + BinaryFunction scan_op) { - storage_.warp_prefixes[0] = output; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->exclusive_scan(input, output, init, reduction, storage, scan_op); } - ::rocprim::syncthreads(); - // Use warp prefix to calculate the final scan results for every thread - output = scan_op(init, output); // include warp prefix in scan results - output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results - if(::rocprim::lane_id() == 0) + template + ROCPRIM_DEVICE inline void exclusive_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) { - output = init; + storage_type_& storage_ = storage.get(); + // Reduce thread items + T thread_input = input[0]; +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + thread_input = scan_op(thread_input, input[i]); + } + + // Scan of reduced values to get prefixes + const auto flat_tid = ::rocprim::flat_block_thread_id(); + this->exclusive_scan_impl(flat_tid, + thread_input, + thread_input, // input, output + storage, + scan_op); + + // this operation overwrites storage_.warp_prefixes[warps_no_ - 1] + T block_prefix + = this->get_block_prefix(flat_tid, + ::rocprim::warp_id(), + storage_.warp_prefixes[warps_no_ - 1], // block reduction + prefix_callback_op, + storage); + + // Include init value and block prefix + T prev = input[0]; + T exclusive = block_prefix; + if(flat_tid != 0) + { + exclusive = scan_op(block_prefix, thread_input); + } + output[0] = exclusive; + +#pragma unroll + for(unsigned int i = 1; i < ItemsPerThread; i++) + { + exclusive = scan_op(exclusive, prev); + prev = input[i]; + output[i] = exclusive; + } } - } - - // Exclusive scan with unknown initial value - template - ROCPRIM_DEVICE inline - auto exclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if<(BlockSize_ > ::rocprim::warp_size())>::type - { - storage_type_& storage_ = storage.get(); - // Perform warp scan on input values - warp_scan_input_type().inclusive_scan( - // not using shared mem, see note in storage_type - input, output, scan_op - ); - // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] - const auto warp_id = ::rocprim::warp_id(); - this->calculate_warp_prefixes(flat_tid, warp_id, output, storage, scan_op); + private: + template + ROCPRIM_DEVICE inline auto inclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if<(BlockSize_ > ::rocprim::warp_size())>::type + { + storage_type_& storage_ = storage.get(); + // Perform warp scan + warp_scan_input_type().inclusive_scan( + // not using shared mem, see note in storage_type + input, + output, + scan_op); + + // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] + const auto warp_id = ::rocprim::warp_id(); + this->calculate_warp_prefixes(flat_tid, warp_id, output, storage, scan_op); + + // Use warp prefix to calculate the final scan results for every thread + if(warp_id != 0) + { + auto warp_prefix = storage_.warp_prefixes[warp_id - 1]; + output = scan_op(warp_prefix, output); + } + } - // Use warp prefix to calculate the final scan results for every thread - T warp_prefix; - if(warp_id != 0) + // When BlockSize is less than warp_size we dont need the extra prefix calculations. + template + ROCPRIM_DEVICE inline auto inclusive_scan_impl(unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if ::rocprim::warp_size())>::type { - warp_prefix = storage_.warp_prefixes[warp_id - 1]; - output = scan_op(warp_prefix, output); + (void)storage; + (void)flat_tid; + storage_type_& storage_ = storage.get(); + // Perform warp scan + warp_scan_input_type().inclusive_scan( + // not using shared mem, see note in storage_type + input, + output, + scan_op); + + if(flat_tid == BlockSize_ - 1) + { + storage_.warp_prefixes[0] = output; + } + ::rocprim::syncthreads(); } - output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results - if(::rocprim::lane_id() == 0) + + // Exclusive scan with initial value when BlockSize is bigger than warp_size + template + ROCPRIM_DEVICE inline auto exclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + T init, + storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if<(BlockSize_ > ::rocprim::warp_size())>::type { - output = warp_prefix; + storage_type_& storage_ = storage.get(); + // Perform warp scan on input values + warp_scan_input_type().inclusive_scan( + // not using shared mem, see note in storage_type + input, + output, + scan_op); + + // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] + const auto warp_id = ::rocprim::warp_id(); + this->calculate_warp_prefixes(flat_tid, warp_id, output, storage, scan_op); + + // Include initial value in warp prefixes, and fix warp prefixes + // for exclusive scan (first warp prefix is init) + auto warp_prefix = init; + if(warp_id != 0) + { + warp_prefix = scan_op(init, storage_.warp_prefixes[warp_id - 1]); + } + + // Use warp prefix to calculate the final scan results for every thread + output = scan_op(warp_prefix, output); // include warp prefix in scan results + output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results + if(::rocprim::lane_id() == 0) + { + output = warp_prefix; + } } - } - - // Exclusive scan with unknown initial value, when BlockSize less than warp_size. - // When BlockSize is less than warp_size we dont need the extra prefix calculations. - template - ROCPRIM_DEVICE inline - auto exclusive_scan_impl(const unsigned int flat_tid, - T input, - T& output, - storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if ::rocprim::warp_size())>::type - { - (void) flat_tid; - (void) storage; - storage_type_& storage_ = storage.get(); - // Perform warp scan on input values - warp_scan_input_type().inclusive_scan( - // not using shared mem, see note in storage_type - input, output, scan_op - ); - - if(flat_tid == BlockSize_ - 1) + + // Exclusive scan with initial value when BlockSize is less than warp_size. + // When BlockSize is less than warp_size we dont need the extra prefix calculations. + template + ROCPRIM_DEVICE inline auto exclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + T init, + storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if ::rocprim::warp_size())>::type { - storage_.warp_prefixes[0] = output; + (void)flat_tid; + (void)storage; + (void)init; + storage_type_& storage_ = storage.get(); + // Perform warp scan on input values + warp_scan_input_type().inclusive_scan( + // not using shared mem, see note in storage_type + input, + output, + scan_op); + + if(flat_tid == BlockSize_ - 1) + { + storage_.warp_prefixes[0] = output; + } + ::rocprim::syncthreads(); + + // Use warp prefix to calculate the final scan results for every thread + output = scan_op(init, output); // include warp prefix in scan results + output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results + if(::rocprim::lane_id() == 0) + { + output = init; + } } - ::rocprim::syncthreads(); - output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results - } - - // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] - template - ROCPRIM_DEVICE inline - void calculate_warp_prefixes(const unsigned int flat_tid, - const unsigned int warp_id, - T inclusive_input, - storage_type& storage, - BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - // Save the warp reduction result, that is the scan result - // for last element in each warp - if(flat_tid == ::rocprim::min((warp_id+1) * warp_size_, BlockSize) - 1) + + // Exclusive scan with unknown initial value + template + ROCPRIM_DEVICE inline auto exclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if<(BlockSize_ > ::rocprim::warp_size())>::type { - storage_.warp_prefixes[warp_id] = inclusive_input; + storage_type_& storage_ = storage.get(); + // Perform warp scan on input values + warp_scan_input_type().inclusive_scan( + // not using shared mem, see note in storage_type + input, + output, + scan_op); + + // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] + const auto warp_id = ::rocprim::warp_id(); + this->calculate_warp_prefixes(flat_tid, warp_id, output, storage, scan_op); + + // Use warp prefix to calculate the final scan results for every thread + T warp_prefix; + if(warp_id != 0) + { + warp_prefix = storage_.warp_prefixes[warp_id - 1]; + output = scan_op(warp_prefix, output); + } + output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results + if(::rocprim::lane_id() == 0) + { + output = warp_prefix; + } } - ::rocprim::syncthreads(); - // Scan the warp reduction results and store in storage_.warp_prefixes - if(flat_tid < warps_no_) + // Exclusive scan with unknown initial value, when BlockSize less than warp_size. + // When BlockSize is less than warp_size we dont need the extra prefix calculations. + template + ROCPRIM_DEVICE inline auto exclusive_scan_impl(const unsigned int flat_tid, + T input, + T& output, + storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if ::rocprim::warp_size())>::type { - auto warp_prefix = storage_.warp_prefixes[flat_tid]; - warp_scan_prefix_type().inclusive_scan( + (void)flat_tid; + (void)storage; + storage_type_& storage_ = storage.get(); + // Perform warp scan on input values + warp_scan_input_type().inclusive_scan( // not using shared mem, see note in storage_type - warp_prefix, warp_prefix, scan_op - ); - storage_.warp_prefixes[flat_tid] = warp_prefix; + input, + output, + scan_op); + + if(flat_tid == BlockSize_ - 1) + { + storage_.warp_prefixes[0] = output; + } + ::rocprim::syncthreads(); + output = warp_shuffle_up(output, 1, warp_size_); // shift to get exclusive results } - ::rocprim::syncthreads(); - } - - // THIS OVERWRITES storage_.warp_prefixes[warps_no_ - 1] - template - ROCPRIM_DEVICE inline - T get_block_prefix(const unsigned int flat_tid, - const unsigned int warp_id, - const T reduction, - PrefixCallback& prefix_callback_op, - storage_type& storage) - { - storage_type_& storage_ = storage.get(); - if(warp_id == 0) + + // i-th warp will have its prefix stored in storage_.warp_prefixes[i-1] + template + ROCPRIM_DEVICE inline void calculate_warp_prefixes(const unsigned int flat_tid, + const unsigned int warp_id, + T inclusive_input, + storage_type& storage, + BinaryFunction scan_op) { - T block_prefix = prefix_callback_op(reduction); - if(flat_tid == 0) + storage_type_& storage_ = storage.get(); + // Save the warp reduction result, that is the scan result + // for last element in each warp + if(flat_tid == ::rocprim::min((warp_id + 1) * warp_size_, BlockSize) - 1) + { + storage_.warp_prefixes[warp_id] = inclusive_input; + } + ::rocprim::syncthreads(); + + // Scan the warp reduction results and store in storage_.warp_prefixes + if(flat_tid < warps_no_) + { + auto warp_prefix = storage_.warp_prefixes[flat_tid]; + warp_scan_prefix_type().inclusive_scan( + // not using shared mem, see note in storage_type + warp_prefix, + warp_prefix, + scan_op); + storage_.warp_prefixes[flat_tid] = warp_prefix; + } + ::rocprim::syncthreads(); + } + + // THIS OVERWRITES storage_.warp_prefixes[warps_no_ - 1] + template + ROCPRIM_DEVICE inline T get_block_prefix(const unsigned int flat_tid, + const unsigned int warp_id, + const T reduction, + PrefixCallback& prefix_callback_op, + storage_type& storage) + { + storage_type_& storage_ = storage.get(); + if(warp_id == 0) { - // Reuse storage_.warp_prefixes[warps_no_ - 1] to store block prefix - storage_.warp_prefixes[warps_no_ - 1] = block_prefix; + T block_prefix = prefix_callback_op(reduction); + if(flat_tid == 0) + { + // Reuse storage_.warp_prefixes[warps_no_ - 1] to store block prefix + storage_.warp_prefixes[warps_no_ - 1] = block_prefix; + } } + ::rocprim::syncthreads(); + return storage_.warp_prefixes[warps_no_ - 1]; } - ::rocprim::syncthreads(); - return storage_.warp_prefixes[warps_no_ - 1]; - } -}; + }; } // end namespace detail diff --git a/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp b/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp index 717efa640..8dba6d355 100644 --- a/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp +++ b/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp @@ -26,8 +26,8 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../warp/warp_sort.hpp" @@ -36,319 +36,272 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class Key, - unsigned int BlockSize, - class Value -> -class block_sort_bitonic -{ - template - struct storage_type_ + template + class block_sort_bitonic { - KeyType key[BlockSize]; - ValueType value[BlockSize]; - }; + template + struct storage_type_ + { + KeyType key[BlockSize]; + ValueType value[BlockSize]; + }; - template - struct storage_type_ - { - KeyType key[BlockSize]; - }; + template + struct storage_type_ + { + KeyType key[BlockSize]; + }; -public: - using storage_type = detail::raw_storage>; + public: + using storage_type = detail::raw_storage>; - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - storage_type& storage, - BinaryFunction compare_function) - { - this->sort_impl( - ::rocprim::flat_block_thread_id(), - storage, compare_function, - thread_key - ); - } - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - BinaryFunction compare_function) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->sort(thread_key, storage, compare_function); - } - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - storage_type& storage, - BinaryFunction compare_function) - { - this->sort_impl( - ::rocprim::flat_block_thread_id(), - storage, compare_function, - thread_key, thread_value - ); - } - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - BinaryFunction compare_function) - { - ROCPRIM_SHARED_MEMORY storage_type storage; - this->sort(thread_key, thread_value, storage, compare_function); - } - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - storage_type& storage, - const unsigned int size, - BinaryFunction compare_function) - { - this->sort_impl( - ::rocprim::flat_block_thread_id(), size, - storage, compare_function, - thread_key - ); - } - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - storage_type& storage, - const unsigned int size, - BinaryFunction compare_function) - { - this->sort_impl( - ::rocprim::flat_block_thread_id(), size, - storage, compare_function, - thread_key, thread_value - ); - } - -private: - ROCPRIM_DEVICE inline - void copy_to_shared(Key& k, const unsigned int flat_tid, storage_type& storage) - { - storage_type_& storage_ = storage.get(); - storage_.key[flat_tid] = k; - ::rocprim::syncthreads(); - } + template + ROCPRIM_DEVICE inline void + sort(Key& thread_key, storage_type& storage, BinaryFunction compare_function) + { + this->sort_impl( + ::rocprim::flat_block_thread_id(), storage, compare_function, thread_key); + } - ROCPRIM_DEVICE inline - void copy_to_shared(Key& k, Value& v, const unsigned int flat_tid, storage_type& storage) - { - storage_type_& storage_ = storage.get(); - storage_.key[flat_tid] = k; - storage_.value[flat_tid] = v; - ::rocprim::syncthreads(); - } - - template - ROCPRIM_DEVICE inline - void swap(Key& key, - const unsigned int flat_tid, - const unsigned int next_id, - const bool dir, - storage_type& storage, - BinaryFunction compare_function) - { - storage_type_& storage_ = storage.get(); - Key next_key = storage_.key[next_id]; - bool compare = compare_function(next_key, key); - bool swap = compare ^ (next_id < flat_tid) ^ dir; - if(swap) + template + ROCPRIM_DEVICE inline void sort(Key& thread_key, BinaryFunction compare_function) { - key = next_key; + ROCPRIM_SHARED_MEMORY storage_type storage; + this->sort(thread_key, storage, compare_function); } - } - - template - ROCPRIM_DEVICE inline - void swap(Key& key, - Value& value, - const unsigned int flat_tid, - const unsigned int next_id, - const bool dir, - storage_type& storage, - BinaryFunction compare_function) - { - storage_type_& storage_ = storage.get(); - Key next_key = storage_.key[next_id]; - Value next_value = storage_.value[next_id]; - bool compare = compare_function(next_key, key); - bool swap = compare ^ (next_id < flat_tid) ^ dir; - if(swap) + + template + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + storage_type& storage, + BinaryFunction compare_function) { - key = next_key; - value = next_value; + this->sort_impl(::rocprim::flat_block_thread_id(), + storage, + compare_function, + thread_key, + thread_value); } - } - - template< - unsigned int Size, - class BinaryFunction, - class... KeyValue - > - ROCPRIM_DEVICE inline - typename std::enable_if<(Size <= ::rocprim::warp_size())>::type - sort_power_two(const unsigned int flat_tid, - storage_type& storage, - BinaryFunction compare_function, - KeyValue&... kv) - { - (void) flat_tid; - (void) storage; - - ::rocprim::warp_sort wsort; - wsort.sort(kv..., compare_function); - } - - template< - unsigned int Size, - class BinaryFunction, - class... KeyValue - > - ROCPRIM_DEVICE inline - typename std::enable_if<(Size > ::rocprim::warp_size())>::type - sort_power_two(const unsigned int flat_tid, - storage_type& storage, - BinaryFunction compare_function, - KeyValue&... kv) - { - const auto warp_id_is_even = ((flat_tid / ::rocprim::warp_size()) % 2) == 0; - ::rocprim::warp_sort wsort; - auto compare_function2 = - [compare_function, warp_id_is_even](const Key& a, const Key& b) mutable -> bool + + template + ROCPRIM_DEVICE inline void + sort(Key& thread_key, Value& thread_value, BinaryFunction compare_function) + { + ROCPRIM_SHARED_MEMORY storage_type storage; + this->sort(thread_key, thread_value, storage, compare_function); + } + + template + ROCPRIM_DEVICE inline void sort(Key& thread_key, + storage_type& storage, + const unsigned int size, + BinaryFunction compare_function) + { + this->sort_impl( + ::rocprim::flat_block_thread_id(), size, storage, compare_function, thread_key); + } + + template + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + storage_type& storage, + const unsigned int size, + BinaryFunction compare_function) + { + this->sort_impl(::rocprim::flat_block_thread_id(), + size, + storage, + compare_function, + thread_key, + thread_value); + } + + private: + ROCPRIM_DEVICE inline void + copy_to_shared(Key& k, const unsigned int flat_tid, storage_type& storage) + { + storage_type_& storage_ = storage.get(); + storage_.key[flat_tid] = k; + ::rocprim::syncthreads(); + } + + ROCPRIM_DEVICE inline void + copy_to_shared(Key& k, Value& v, const unsigned int flat_tid, storage_type& storage) + { + storage_type_& storage_ = storage.get(); + storage_.key[flat_tid] = k; + storage_.value[flat_tid] = v; + ::rocprim::syncthreads(); + } + + template + ROCPRIM_DEVICE inline void swap(Key& key, + const unsigned int flat_tid, + const unsigned int next_id, + const bool dir, + storage_type& storage, + BinaryFunction compare_function) + { + storage_type_& storage_ = storage.get(); + Key next_key = storage_.key[next_id]; + bool compare = compare_function(next_key, key); + bool swap = compare ^ (next_id < flat_tid) ^ dir; + if(swap) { + key = next_key; + } + } + + template + ROCPRIM_DEVICE inline void swap(Key& key, + Value& value, + const unsigned int flat_tid, + const unsigned int next_id, + const bool dir, + storage_type& storage, + BinaryFunction compare_function) + { + storage_type_& storage_ = storage.get(); + Key next_key = storage_.key[next_id]; + Value next_value = storage_.value[next_id]; + bool compare = compare_function(next_key, key); + bool swap = compare ^ (next_id < flat_tid) ^ dir; + if(swap) + { + key = next_key; + value = next_value; + } + } + + template + ROCPRIM_DEVICE inline typename std::enable_if<(Size <= ::rocprim::warp_size())>::type + sort_power_two(const unsigned int flat_tid, + storage_type& storage, + BinaryFunction compare_function, + KeyValue&... kv) + { + (void)flat_tid; + (void)storage; + + ::rocprim::warp_sort wsort; + wsort.sort(kv..., compare_function); + } + + template + ROCPRIM_DEVICE inline typename std::enable_if<(Size > ::rocprim::warp_size())>::type + sort_power_two(const unsigned int flat_tid, + storage_type& storage, + BinaryFunction compare_function, + KeyValue&... kv) + { + const auto warp_id_is_even = ((flat_tid / ::rocprim::warp_size()) % 2) == 0; + ::rocprim::warp_sort wsort; + auto compare_function2 + = [compare_function, warp_id_is_even](const Key& a, const Key& b) mutable -> bool { auto r = compare_function(a, b); if(warp_id_is_even) return r; return !r; }; - wsort.sort(kv..., compare_function2); + wsort.sort(kv..., compare_function2); - #pragma unroll - for(unsigned int length = ::rocprim::warp_size(); length < Size; length *= 2) - { - bool dir = (flat_tid & (length * 2)) != 0; - #pragma unroll - for(unsigned int k = length; k > 0; k /= 2) +#pragma unroll + for(unsigned int length = ::rocprim::warp_size(); length < Size; length *= 2) { - copy_to_shared(kv..., flat_tid, storage); - swap(kv..., flat_tid, flat_tid ^ k, dir, storage, compare_function); - ::rocprim::syncthreads(); + bool dir = (flat_tid & (length * 2)) != 0; +#pragma unroll + for(unsigned int k = length; k > 0; k /= 2) + { + copy_to_shared(kv..., flat_tid, storage); + swap(kv..., flat_tid, flat_tid ^ k, dir, storage, compare_function); + ::rocprim::syncthreads(); + } } } - } - - template< - unsigned int Size, - class BinaryFunction, - class... KeyValue - > - ROCPRIM_DEVICE inline - typename std::enable_if::type - sort_impl(const unsigned int flat_tid, - storage_type& storage, - BinaryFunction compare_function, - KeyValue&... kv) - { - static constexpr unsigned int PairSize = sizeof...(KeyValue); - static_assert( - PairSize < 3, - "KeyValue parameter pack can 1 or 2 elements (key, or key and value)" - ); - - sort_power_two(flat_tid, storage, compare_function, kv...); - } - - // In case BlockSize is not a power-of-two, the slower odd-even mergesort function is used - // instead of the bitonic sort function - template< - unsigned int Size, - class BinaryFunction, - class... KeyValue - > - ROCPRIM_DEVICE inline - typename std::enable_if::type - sort_impl(const unsigned int flat_tid, - storage_type& storage, - BinaryFunction compare_function, - KeyValue&... kv) - { - static constexpr unsigned int PairSize = sizeof...(KeyValue); - static_assert( - PairSize < 3, - "KeyValue parameter pack can 1 or 2 elements (key, or key and value)" - ); - copy_to_shared(kv..., flat_tid, storage); + template + ROCPRIM_DEVICE inline typename std::enable_if::type + sort_impl(const unsigned int flat_tid, + storage_type& storage, + BinaryFunction compare_function, + KeyValue&... kv) + { + static constexpr unsigned int PairSize = sizeof...(KeyValue); + static_assert(PairSize < 3, + "KeyValue parameter pack can 1 or 2 elements (key, or key and value)"); - bool is_even = (flat_tid % 2) == 0; - unsigned int odd_id = (is_even) ? ::rocprim::max(flat_tid, 1u) - 1 : ::rocprim::min(flat_tid + 1, Size - 1); - unsigned int even_id = (is_even) ? ::rocprim::min(flat_tid + 1, Size - 1) : ::rocprim::max(flat_tid, 1u) - 1; + sort_power_two(flat_tid, storage, compare_function, kv...); + } - #pragma unroll - for(unsigned int length = 0; length < Size; length++) + // In case BlockSize is not a power-of-two, the slower odd-even mergesort function is used + // instead of the bitonic sort function + template + ROCPRIM_DEVICE inline typename std::enable_if::type + sort_impl(const unsigned int flat_tid, + storage_type& storage, + BinaryFunction compare_function, + KeyValue&... kv) { - unsigned int next_id = (length % 2) == 0 ? even_id : odd_id; - swap(kv..., flat_tid, next_id, 0, storage, compare_function); - ::rocprim::syncthreads(); + static constexpr unsigned int PairSize = sizeof...(KeyValue); + static_assert(PairSize < 3, + "KeyValue parameter pack can 1 or 2 elements (key, or key and value)"); + copy_to_shared(kv..., flat_tid, storage); + + bool is_even = (flat_tid % 2) == 0; + unsigned int odd_id = (is_even) ? ::rocprim::max(flat_tid, 1u) - 1 + : ::rocprim::min(flat_tid + 1, Size - 1); + unsigned int even_id = (is_even) ? ::rocprim::min(flat_tid + 1, Size - 1) + : ::rocprim::max(flat_tid, 1u) - 1; + +#pragma unroll + for(unsigned int length = 0; length < Size; length++) + { + unsigned int next_id = (length % 2) == 0 ? even_id : odd_id; + swap(kv..., flat_tid, next_id, 0, storage, compare_function); + ::rocprim::syncthreads(); + copy_to_shared(kv..., flat_tid, storage); + } } - } - - template< - class BinaryFunction, - class... KeyValue - > - ROCPRIM_DEVICE inline - void sort_impl(const unsigned int flat_tid, - const unsigned int size, - storage_type& storage, - BinaryFunction compare_function, - KeyValue&... kv) - { - static constexpr unsigned int PairSize = sizeof...(KeyValue); - static_assert( - PairSize < 3, - "KeyValue parameter pack can 1 or 2 elements (key, or key and value)" - ); - if(size > BlockSize) + template + ROCPRIM_DEVICE inline void sort_impl(const unsigned int flat_tid, + const unsigned int size, + storage_type& storage, + BinaryFunction compare_function, + KeyValue&... kv) { - return; - } + static constexpr unsigned int PairSize = sizeof...(KeyValue); + static_assert(PairSize < 3, + "KeyValue parameter pack can 1 or 2 elements (key, or key and value)"); - copy_to_shared(kv..., flat_tid, storage); + if(size > BlockSize) + { + return; + } - bool is_even = (flat_tid % 2 == 0); - unsigned int odd_id = (is_even) ? ::rocprim::max(flat_tid, 1u) - 1 : ::rocprim::min(flat_tid + 1, size - 1); - unsigned int even_id = (is_even) ? ::rocprim::min(flat_tid + 1, size - 1) : ::rocprim::max(flat_tid, 1u) - 1; + copy_to_shared(kv..., flat_tid, storage); - for(unsigned int length = 0; length < size; length++) - { - unsigned int next_id = (length % 2 == 0) ? even_id : odd_id; - // Use only "valid" keys to ensure that compare_function will not use garbage keys - // for example, as indices of an array (a lookup table) - if(flat_tid < size) + bool is_even = (flat_tid % 2 == 0); + unsigned int odd_id = (is_even) ? ::rocprim::max(flat_tid, 1u) - 1 + : ::rocprim::min(flat_tid + 1, size - 1); + unsigned int even_id = (is_even) ? ::rocprim::min(flat_tid + 1, size - 1) + : ::rocprim::max(flat_tid, 1u) - 1; + + for(unsigned int length = 0; length < size; length++) { - swap(kv..., flat_tid, next_id, 0, storage, compare_function); + unsigned int next_id = (length % 2 == 0) ? even_id : odd_id; + // Use only "valid" keys to ensure that compare_function will not use garbage keys + // for example, as indices of an array (a lookup table) + if(flat_tid < size) + { + swap(kv..., flat_tid, next_id, 0, storage, compare_function); + } + ::rocprim::syncthreads(); + copy_to_shared(kv..., flat_tid, storage); } - ::rocprim::syncthreads(); - copy_to_shared(kv..., flat_tid, storage); } - } -}; + }; } // end namespace detail diff --git a/rocprim/include/rocprim/config.hpp b/rocprim/include/rocprim/config.hpp index e004a6acb..911fc9fb5 100644 --- a/rocprim/include/rocprim/config.hpp +++ b/rocprim/include/rocprim/config.hpp @@ -22,43 +22,43 @@ #define ROCPRIM_CONFIG_HPP_ #define BEGIN_ROCPRIM_NAMESPACE \ - namespace rocprim { + namespace rocprim \ + { -#define END_ROCPRIM_NAMESPACE \ - } /* rocprim */ +#define END_ROCPRIM_NAMESPACE } /* rocprim */ -#include #include +#include #ifndef ROCPRIM_DEVICE - #define ROCPRIM_DEVICE __device__ - #define ROCPRIM_HOST __host__ - #define ROCPRIM_HOST_DEVICE __host__ __device__ - #define ROCPRIM_SHARED_MEMORY __shared__ +#define ROCPRIM_DEVICE __device__ +#define ROCPRIM_HOST __host__ +#define ROCPRIM_HOST_DEVICE __host__ __device__ +#define ROCPRIM_SHARED_MEMORY __shared__ #endif // TODO remove when the issue https://github.com/RadeonOpenCompute/hcc/issues/715 is fixed #ifndef ROCPRIM_DISABLE_DPP - #define ROCPRIM_DISABLE_DPP +#define ROCPRIM_DISABLE_DPP #endif #ifdef ROCPRIM_DISABLE_DPP - #define ROCPRIM_DETAIL_USE_DPP false +#define ROCPRIM_DETAIL_USE_DPP false #else - #define ROCPRIM_DETAIL_USE_DPP true +#define ROCPRIM_DETAIL_USE_DPP true #endif #ifdef ROCPRIM_DISABLE_LOOKBACK_SCAN - #define ROCPRIM_DETAIL_USE_LOOKBACK_SCAN false +#define ROCPRIM_DETAIL_USE_LOOKBACK_SCAN false #else - #define ROCPRIM_DETAIL_USE_LOOKBACK_SCAN true +#define ROCPRIM_DETAIL_USE_LOOKBACK_SCAN true #endif // Defines targeted AMD architecture. Supported values: // * 803 (gfx803) // * 900 (gfx900) #ifndef ROCPRIM_TARGET_ARCH - #define ROCPRIM_TARGET_ARCH 0 +#define ROCPRIM_TARGET_ARCH 0 #endif #endif // ROCPRIM_CONFIG_HPP_ diff --git a/rocprim/include/rocprim/detail/all_true.hpp b/rocprim/include/rocprim/detail/all_true.hpp index 7fac4964b..96eb648d1 100644 --- a/rocprim/include/rocprim/detail/all_true.hpp +++ b/rocprim/include/rocprim/detail/all_true.hpp @@ -29,22 +29,21 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { - -// all_of -template -struct all_true : std::true_type -{ -}; - -template -struct all_true : all_true -{ -}; - -template -struct all_true : std::false_type -{ -}; + // all_of + template + struct all_true : std::true_type + { + }; + + template + struct all_true : all_true + { + }; + + template + struct all_true : std::false_type + { + }; } // end namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/detail/binary_op_wrappers.hpp b/rocprim/include/rocprim/detail/binary_op_wrappers.hpp index 4b3def3e9..67fe5961c 100644 --- a/rocprim/include/rocprim/detail/binary_op_wrappers.hpp +++ b/rocprim/include/rocprim/detail/binary_op_wrappers.hpp @@ -24,9 +24,9 @@ #include #include "../config.hpp" +#include "../functional.hpp" #include "../intrinsics.hpp" #include "../types.hpp" -#include "../functional.hpp" #include "../detail/various.hpp" @@ -35,99 +35,87 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class BinaryFunction, - class ResultType = typename BinaryFunction::result_type, - class InputType = typename BinaryFunction::input_type -> -struct reverse_binary_op_wrapper -{ - using result_type = ResultType; - using input_type = InputType; - - ROCPRIM_HOST_DEVICE inline - reverse_binary_op_wrapper() = default; - - ROCPRIM_HOST_DEVICE inline - reverse_binary_op_wrapper(BinaryFunction binary_op) - : binary_op_(binary_op) + template + struct reverse_binary_op_wrapper { - } - - ROCPRIM_HOST_DEVICE inline - ~reverse_binary_op_wrapper() = default; + using result_type = ResultType; + using input_type = InputType; - ROCPRIM_HOST_DEVICE inline - result_type operator()(const input_type& t1, const input_type& t2) - { - return binary_op_(t2, t1); - } + ROCPRIM_HOST_DEVICE inline reverse_binary_op_wrapper() = default; -private: - BinaryFunction binary_op_; -}; + ROCPRIM_HOST_DEVICE inline reverse_binary_op_wrapper(BinaryFunction binary_op) + : binary_op_(binary_op) + { + } -// Wrapper for performing head-flagged scan -template -struct headflag_scan_op_wrapper -{ - static_assert(std::is_convertible::value, "F must be convertible to bool"); + ROCPRIM_HOST_DEVICE inline ~reverse_binary_op_wrapper() = default; - using result_type = rocprim::tuple; - using input_type = result_type; + ROCPRIM_HOST_DEVICE inline result_type operator()(const input_type& t1, + const input_type& t2) + { + return binary_op_(t2, t1); + } - ROCPRIM_HOST_DEVICE inline - headflag_scan_op_wrapper() = default; + private: + BinaryFunction binary_op_; + }; - ROCPRIM_HOST_DEVICE inline - headflag_scan_op_wrapper(BinaryFunction scan_op) - : scan_op_(scan_op) + // Wrapper for performing head-flagged scan + template + struct headflag_scan_op_wrapper { - } + static_assert(std::is_convertible::value, "F must be convertible to bool"); - ROCPRIM_HOST_DEVICE inline - ~headflag_scan_op_wrapper() = default; + using result_type = rocprim::tuple; + using input_type = result_type; - ROCPRIM_HOST_DEVICE inline - result_type operator()(const input_type& t1, const input_type& t2) - { - if(!rocprim::get<1>(t2)) + ROCPRIM_HOST_DEVICE inline headflag_scan_op_wrapper() = default; + + ROCPRIM_HOST_DEVICE inline headflag_scan_op_wrapper(BinaryFunction scan_op) + : scan_op_(scan_op) { - return rocprim::make_tuple( - scan_op_(rocprim::get<0>(t1), rocprim::get<0>(t2)), - static_cast(rocprim::get<1>(t1) || rocprim::get<1>(t2)) - ); } - return t2; - } -private: - BinaryFunction scan_op_; -}; + ROCPRIM_HOST_DEVICE inline ~headflag_scan_op_wrapper() = default; + + ROCPRIM_HOST_DEVICE inline result_type operator()(const input_type& t1, + const input_type& t2) + { + if(!rocprim::get<1>(t2)) + { + return rocprim::make_tuple( + scan_op_(rocprim::get<0>(t1), rocprim::get<0>(t2)), + static_cast(rocprim::get<1>(t1) || rocprim::get<1>(t2))); + } + return t2; + } + private: + BinaryFunction scan_op_; + }; -template -struct inequality_wrapper -{ - using equality_op_type = EqualityOp; + template + struct inequality_wrapper + { + using equality_op_type = EqualityOp; - ROCPRIM_HOST_DEVICE inline - inequality_wrapper() = default; + ROCPRIM_HOST_DEVICE inline inequality_wrapper() = default; - ROCPRIM_HOST_DEVICE inline - inequality_wrapper(equality_op_type equality_op) - : equality_op(equality_op) - {} + ROCPRIM_HOST_DEVICE inline inequality_wrapper(equality_op_type equality_op) + : equality_op(equality_op) + { + } - template - ROCPRIM_DEVICE inline - bool operator()(const T &a, const U &b) - { - return !equality_op(a, b); - } + template + ROCPRIM_DEVICE inline bool operator()(const T& a, const U& b) + { + return !equality_op(a, b); + } - equality_op_type equality_op; -}; + equality_op_type equality_op; + }; } // end of detail namespace diff --git a/rocprim/include/rocprim/detail/match_result_type.hpp b/rocprim/include/rocprim/detail/match_result_type.hpp index e143a8aac..644201373 100644 --- a/rocprim/include/rocprim/detail/match_result_type.hpp +++ b/rocprim/include/rocprim/detail/match_result_type.hpp @@ -29,81 +29,86 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// invoke_result is based on https://en.cppreference.com/w/cpp/types/result_of -// The main difference is using ROCPRIM_HOST_DEVICE, this allows to -// use invoke_result with device-only lambdas/functors in host-only functions -// on HIP-clang. - -template -struct is_reference_wrapper : std::false_type {}; -template -struct is_reference_wrapper> : std::true_type {}; - -template -struct invoke_impl { - template - ROCPRIM_HOST_DEVICE - static auto call(F&& f, Args&&... args) - -> decltype(std::forward(f)(std::forward(args)...)); -}; - -template -struct invoke_impl -{ - template::type, - class = typename std::enable_if::value>::type - > - ROCPRIM_HOST_DEVICE - static auto get(T&& t) -> T&&; - - template::type, - class = typename std::enable_if::value>::type - > - ROCPRIM_HOST_DEVICE - static auto get(T&& t) -> decltype(t.get()); - - template::type, - class = typename std::enable_if::value>::type, - class = typename std::enable_if::value>::type - > - ROCPRIM_HOST_DEVICE - static auto get(T&& t) -> decltype(*std::forward(t)); - - template::value>::type - > - ROCPRIM_HOST_DEVICE - static auto call(MT1 B::*pmf, T&& t, Args&&... args) - -> decltype((invoke_impl::get(std::forward(t)).*pmf)(std::forward(args)...)); - - template - ROCPRIM_HOST_DEVICE - static auto call(MT B::*pmd, T&& t) - -> decltype(invoke_impl::get(std::forward(t)).*pmd); -}; - -template::type> -ROCPRIM_HOST_DEVICE -auto INVOKE(F&& f, Args&&... args) - -> decltype(invoke_impl::call(std::forward(f), std::forward(args)...)); - -// Conforming C++14 implementation (is also a valid C++11 implementation): -template -struct invoke_result_impl { }; -template -struct invoke_result_impl(), std::declval()...))), F, Args...> -{ - using type = decltype(INVOKE(std::declval(), std::declval()...)); -}; - -template -struct invoke_result : invoke_result_impl {}; - -template -struct match_result_type -{ - using type = typename invoke_result::type; -}; + // invoke_result is based on https://en.cppreference.com/w/cpp/types/result_of + // The main difference is using ROCPRIM_HOST_DEVICE, this allows to + // use invoke_result with device-only lambdas/functors in host-only functions + // on HIP-clang. + + template + struct is_reference_wrapper : std::false_type + { + }; + template + struct is_reference_wrapper> : std::true_type + { + }; + + template + struct invoke_impl + { + template + ROCPRIM_HOST_DEVICE static auto call(F&& f, Args&&... args) + -> decltype(std::forward(f)(std::forward(args)...)); + }; + + template + struct invoke_impl + { + template ::type, + class = typename std::enable_if::value>::type> + ROCPRIM_HOST_DEVICE static auto get(T&& t) -> T&&; + + template ::type, + class = typename std::enable_if::value>::type> + ROCPRIM_HOST_DEVICE static auto get(T&& t) -> decltype(t.get()); + + template ::type, + class = typename std::enable_if::value>::type, + class = typename std::enable_if::value>::type> + ROCPRIM_HOST_DEVICE static auto get(T&& t) -> decltype(*std::forward(t)); + + template ::value>::type> + ROCPRIM_HOST_DEVICE static auto call(MT1 B::*pmf, T&& t, Args&&... args) + -> decltype((invoke_impl::get(std::forward(t)).*pmf)(std::forward(args)...)); + + template + ROCPRIM_HOST_DEVICE static auto call(MT B::*pmd, T&& t) + -> decltype(invoke_impl::get(std::forward(t)).*pmd); + }; + + template ::type> + ROCPRIM_HOST_DEVICE auto INVOKE(F&& f, Args&&... args) + -> decltype(invoke_impl::call(std::forward(f), std::forward(args)...)); + + // Conforming C++14 implementation (is also a valid C++11 implementation): + template + struct invoke_result_impl + { + }; + template + struct invoke_result_impl(), std::declval()...))), + F, + Args...> + { + using type = decltype(INVOKE(std::declval(), std::declval()...)); + }; + + template + struct invoke_result : invoke_result_impl + { + }; + + template + struct match_result_type + { + using type = typename invoke_result::type; + }; } // end namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/detail/radix_sort.hpp b/rocprim/include/rocprim/detail/radix_sort.hpp index f1f115679..32d8968e2 100644 --- a/rocprim/include/rocprim/detail/radix_sort.hpp +++ b/rocprim/include/rocprim/detail/radix_sort.hpp @@ -30,137 +30,142 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Encode and decode integral and floating point values for radix sort in such a way that preserves -// correct order of negative and positive keys (i.e. negative keys go before positive ones, -// which is not true for a simple reinterpetation of the key's bits). + // Encode and decode integral and floating point values for radix sort in such a way that preserves + // correct order of negative and positive keys (i.e. negative keys go before positive ones, + // which is not true for a simple reinterpetation of the key's bits). -template -struct radix_key_codec_integral { }; - -template -struct radix_key_codec_integral::value>::type> -{ - using bit_key_type = BitKey; - - ROCPRIM_DEVICE inline - static bit_key_type encode(Key key) + template + struct radix_key_codec_integral { - return *reinterpret_cast(&key); - } + }; - ROCPRIM_DEVICE inline - static Key decode(bit_key_type bit_key) + template + struct radix_key_codec_integral< + Key, + BitKey, + typename std::enable_if<::rocprim::is_unsigned::value>::type> { - return *reinterpret_cast(&bit_key); - } -}; - -template -struct radix_key_codec_integral::value>::type> -{ - using bit_key_type = BitKey; - - static constexpr bit_key_type sign_bit = bit_key_type(1) << (sizeof(bit_key_type) * 8 - 1); - - ROCPRIM_DEVICE inline - static bit_key_type encode(Key key) + using bit_key_type = BitKey; + + ROCPRIM_DEVICE inline static bit_key_type encode(Key key) + { + return *reinterpret_cast(&key); + } + + ROCPRIM_DEVICE inline static Key decode(bit_key_type bit_key) + { + return *reinterpret_cast(&bit_key); + } + }; + + template + struct radix_key_codec_integral::value>::type> { - return sign_bit ^ *reinterpret_cast(&key); - } + using bit_key_type = BitKey; - ROCPRIM_DEVICE inline - static Key decode(bit_key_type bit_key) - { - bit_key ^= sign_bit; - return *reinterpret_cast(&bit_key); - } -}; + static constexpr bit_key_type sign_bit = bit_key_type(1) << (sizeof(bit_key_type) * 8 - 1); -template -struct radix_key_codec_floating -{ - using bit_key_type = BitKey; + ROCPRIM_DEVICE inline static bit_key_type encode(Key key) + { + return sign_bit ^ *reinterpret_cast(&key); + } - static constexpr bit_key_type sign_bit = bit_key_type(1) << (sizeof(bit_key_type) * 8 - 1); + ROCPRIM_DEVICE inline static Key decode(bit_key_type bit_key) + { + bit_key ^= sign_bit; + return *reinterpret_cast(&bit_key); + } + }; - ROCPRIM_DEVICE inline - static bit_key_type encode(Key key) + template + struct radix_key_codec_floating { - bit_key_type bit_key = *reinterpret_cast(&key); - bit_key ^= (sign_bit & bit_key) == 0 ? sign_bit : bit_key_type(-1); - return bit_key; - } - - ROCPRIM_DEVICE inline - static Key decode(bit_key_type bit_key) + using bit_key_type = BitKey; + + static constexpr bit_key_type sign_bit = bit_key_type(1) << (sizeof(bit_key_type) * 8 - 1); + + ROCPRIM_DEVICE inline static bit_key_type encode(Key key) + { + bit_key_type bit_key = *reinterpret_cast(&key); + bit_key ^= (sign_bit & bit_key) == 0 ? sign_bit : bit_key_type(-1); + return bit_key; + } + + ROCPRIM_DEVICE inline static Key decode(bit_key_type bit_key) + { + bit_key ^= (sign_bit & bit_key) == 0 ? bit_key_type(-1) : sign_bit; + return *reinterpret_cast(&bit_key); + } + }; + + template + struct radix_key_codec_base { - bit_key ^= (sign_bit & bit_key) == 0 ? bit_key_type(-1) : sign_bit; - return *reinterpret_cast(&bit_key); - } -}; - -template -struct radix_key_codec_base -{ - static_assert(sizeof(Key) == 0, - "Only integral and floating point types supported as radix sort keys"); -}; - -template -struct radix_key_codec_base< - Key, - typename std::enable_if<::rocprim::is_integral::value>::type -> : radix_key_codec_integral::type> { }; - -template<> -struct radix_key_codec_base -{ - using bit_key_type = unsigned char; - - ROCPRIM_DEVICE inline - static bit_key_type encode(bool key) + static_assert(sizeof(Key) == 0, + "Only integral and floating point types supported as radix sort keys"); + }; + + template + struct radix_key_codec_base::value>::type> + : radix_key_codec_integral::type> { - return static_cast(key); - } + }; - ROCPRIM_DEVICE inline - static bool decode(bit_key_type bit_key) + template <> + struct radix_key_codec_base { - return static_cast(bit_key); - } -}; - -template<> -struct radix_key_codec_base<::rocprim::half> : radix_key_codec_floating<::rocprim::half, unsigned short> { }; - -template<> -struct radix_key_codec_base : radix_key_codec_floating { }; - -template<> -struct radix_key_codec_base : radix_key_codec_floating { }; - -template -class radix_key_codec : protected radix_key_codec_base -{ - using base_type = radix_key_codec_base; + using bit_key_type = unsigned char; + + ROCPRIM_DEVICE inline static bit_key_type encode(bool key) + { + return static_cast(key); + } + + ROCPRIM_DEVICE inline static bool decode(bit_key_type bit_key) + { + return static_cast(bit_key); + } + }; + + template <> + struct radix_key_codec_base<::rocprim::half> + : radix_key_codec_floating<::rocprim::half, unsigned short> + { + }; -public: - using bit_key_type = typename base_type::bit_key_type; + template <> + struct radix_key_codec_base : radix_key_codec_floating + { + }; - ROCPRIM_DEVICE inline - static bit_key_type encode(Key key) + template <> + struct radix_key_codec_base : radix_key_codec_floating { - bit_key_type bit_key = base_type::encode(key); - return (Descending ? ~bit_key : bit_key); - } + }; - ROCPRIM_DEVICE inline - static Key decode(bit_key_type bit_key) + template + class radix_key_codec : protected radix_key_codec_base { - bit_key = (Descending ? ~bit_key : bit_key); - return base_type::decode(bit_key); - } -}; + using base_type = radix_key_codec_base; + + public: + using bit_key_type = typename base_type::bit_key_type; + + ROCPRIM_DEVICE inline static bit_key_type encode(Key key) + { + bit_key_type bit_key = base_type::encode(key); + return (Descending ? ~bit_key : bit_key); + } + + ROCPRIM_DEVICE inline static Key decode(bit_key_type bit_key) + { + bit_key = (Descending ? ~bit_key : bit_key); + return base_type::decode(bit_key); + } + }; } // end namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/detail/various.hpp b/rocprim/include/rocprim/detail/various.hpp index 65973d37c..0ec384c24 100644 --- a/rocprim/include/rocprim/detail/various.hpp +++ b/rocprim/include/rocprim/detail/various.hpp @@ -24,8 +24,8 @@ #include #include "../config.hpp" -#include "../types.hpp" #include "../type_traits.hpp" +#include "../types.hpp" // TODO: Refactor when it gets crowded @@ -33,213 +33,184 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -struct empty_storage_type -{ + struct empty_storage_type + { + }; -}; + template + ROCPRIM_HOST_DEVICE inline constexpr bool is_power_of_two(const T x) + { + static_assert(::rocprim::is_integral::value, "T must be integer type"); + return (x > 0) && ((x & (x - 1)) == 0); + } -template -ROCPRIM_HOST_DEVICE inline -constexpr bool is_power_of_two(const T x) -{ - static_assert(::rocprim::is_integral::value, "T must be integer type"); - return (x > 0) && ((x & (x - 1)) == 0); -} + template + ROCPRIM_HOST_DEVICE inline constexpr T next_power_of_two(const T x, const T acc = 1) + { + static_assert(::rocprim::is_unsigned::value, "T must be unsigned type"); + return acc >= x ? acc : next_power_of_two(x, 2 * acc); + } -template -ROCPRIM_HOST_DEVICE inline -constexpr T next_power_of_two(const T x, const T acc = 1) -{ - static_assert(::rocprim::is_unsigned::value, "T must be unsigned type"); - return acc >= x ? acc : next_power_of_two(x, 2 * acc); -} - -template -ROCPRIM_HOST_DEVICE inline -constexpr auto ceiling_div(T a, T b) - -> typename std::enable_if<::rocprim::is_integral::value, T>::type -{ - return (a + b - 1) / b; -} + template + ROCPRIM_HOST_DEVICE inline constexpr auto ceiling_div(T a, T b) -> + typename std::enable_if<::rocprim::is_integral::value, T>::type + { + return (a + b - 1) / b; + } -ROCPRIM_HOST_DEVICE inline -size_t align_size(size_t size, size_t alignment = 256) -{ - return ceiling_div(size, alignment) * alignment; -} - -// Select the minimal warp size for block of size block_size, it's -// useful for blocks smaller than maximal warp size. -template -ROCPRIM_HOST_DEVICE inline -constexpr T get_min_warp_size(const T block_size, const T max_warp_size) -{ - static_assert(::rocprim::is_unsigned::value, "T must be unsigned type"); - return block_size >= max_warp_size ? max_warp_size : next_power_of_two(block_size); -} - -template -struct is_warpsize_shuffleable { - static const bool value = detail::is_power_of_two(WarpSize); -}; - -// Selects an appropriate vector_type based on the input T and size N. -// The byte size is calculated and used to select an appropriate vector_type. -template -struct match_vector_type -{ - static constexpr unsigned int size = sizeof(T) * N; - using vector_base_type = - typename std::conditional< + ROCPRIM_HOST_DEVICE inline size_t align_size(size_t size, size_t alignment = 256) + { + return ceiling_div(size, alignment) * alignment; + } + + // Select the minimal warp size for block of size block_size, it's + // useful for blocks smaller than maximal warp size. + template + ROCPRIM_HOST_DEVICE inline constexpr T get_min_warp_size(const T block_size, + const T max_warp_size) + { + static_assert(::rocprim::is_unsigned::value, "T must be unsigned type"); + return block_size >= max_warp_size ? max_warp_size : next_power_of_two(block_size); + } + + template + struct is_warpsize_shuffleable + { + static const bool value = detail::is_power_of_two(WarpSize); + }; + + // Selects an appropriate vector_type based on the input T and size N. + // The byte size is calculated and used to select an appropriate vector_type. + template + struct match_vector_type + { + static constexpr unsigned int size = sizeof(T) * N; + using vector_base_type = typename std::conditional< sizeof(T) >= 4, int, - typename std::conditional< - sizeof(T) >= 2, - short, - char - >::type - >::type; - - using vector_4 = typename make_vector_type::type; - using vector_2 = typename make_vector_type::type; - using vector_1 = typename make_vector_type::type; - - using type = - typename std::conditional< + typename std::conditional= 2, short, char>::type>::type; + + using vector_4 = typename make_vector_type::type; + using vector_2 = typename make_vector_type::type; + using vector_1 = typename make_vector_type::type; + + using type = typename std::conditional< size % sizeof(vector_4) == 0, vector_4, - typename std::conditional< - size % sizeof(vector_2) == 0, - vector_2, - vector_1 - >::type - >::type; -}; - -// Checks if Items is odd and ensures that size of T is smaller than vector_type. -template -ROCPRIM_HOST_DEVICE -constexpr bool is_vectorizable() -{ - return (Items % 2 == 0) && - (sizeof(T) < sizeof(typename match_vector_type::type)); -} + typename std::conditional::type>:: + type; + }; -// Returns the number of LDS (local data share) banks. -ROCPRIM_HOST_DEVICE -constexpr unsigned int get_lds_banks_no() -{ - // Currently all devices supported by ROCm have 32 banks (4 bytes each) - return 32; -} - -// Finds biggest fundamental type for type T that sizeof(T) is -// a multiple of that type's size. -template -struct match_fundamental_type -{ - using type = - typename std::conditional< - sizeof(T)%8 == 0, - unsigned long long, - typename std::conditional< - sizeof(T)%4 == 0, - unsigned int, - typename std::conditional< - sizeof(T)%2 == 0, - unsigned short, - unsigned char - >::type - >::type - >::type; -}; - -template -ROCPRIM_DEVICE inline -auto store_volatile(T * output, T value) - -> typename std::enable_if::value>::type -{ - *const_cast(output) = value; -} - -template -ROCPRIM_DEVICE inline -auto store_volatile(T * output, T value) - -> typename std::enable_if::value>::type -{ - using fundamental_type = typename match_fundamental_type::type; - constexpr unsigned int n = sizeof(T) / sizeof(fundamental_type); + // Checks if Items is odd and ensures that size of T is smaller than vector_type. + template + ROCPRIM_HOST_DEVICE constexpr bool is_vectorizable() + { + return (Items % 2 == 0) && (sizeof(T) < sizeof(typename match_vector_type::type)); + } - auto input_ptr = reinterpret_cast(&value); - auto output_ptr = reinterpret_cast(output); + // Returns the number of LDS (local data share) banks. + ROCPRIM_HOST_DEVICE + constexpr unsigned int get_lds_banks_no() + { + // Currently all devices supported by ROCm have 32 banks (4 bytes each) + return 32; + } - #pragma unroll - for(unsigned int i = 0; i < n; i++) + // Finds biggest fundamental type for type T that sizeof(T) is + // a multiple of that type's size. + template + struct match_fundamental_type { - output_ptr[i] = input_ptr[i]; + using type = typename std::conditional< + sizeof(T) % 8 == 0, + unsigned long long, + typename std::conditional::type>::type>::type; + }; + + template + ROCPRIM_DEVICE inline auto store_volatile(T* output, T value) -> + typename std::enable_if::value>::type + { + *const_cast(output) = value; } -} -template -ROCPRIM_DEVICE inline -auto load_volatile(T * input) - -> typename std::enable_if::value, T>::type -{ - T retval = *const_cast(input); - return retval; -} - -template -ROCPRIM_DEVICE inline -auto load_volatile(T * input) - -> typename std::enable_if::value, T>::type -{ - using fundamental_type = typename match_fundamental_type::type; - constexpr unsigned int n = sizeof(T) / sizeof(fundamental_type); + template + ROCPRIM_DEVICE inline auto store_volatile(T* output, T value) -> + typename std::enable_if::value>::type + { + using fundamental_type = typename match_fundamental_type::type; + constexpr unsigned int n = sizeof(T) / sizeof(fundamental_type); - T retval; - auto output_ptr = reinterpret_cast(&retval); - auto input_ptr = reinterpret_cast(input); + auto input_ptr = reinterpret_cast(&value); + auto output_ptr = reinterpret_cast(output); - #pragma unroll - for(unsigned int i = 0; i < n; i++) - { - output_ptr[i] = input_ptr[i]; +#pragma unroll + for(unsigned int i = 0; i < n; i++) + { + output_ptr[i] = input_ptr[i]; + } } - return retval; -} - -// A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions -template -struct raw_storage -{ - // Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T - typedef typename detail::match_fundamental_type::type device_word; - // Backing storage - device_word storage[sizeof(T) / sizeof(device_word)]; + template + ROCPRIM_DEVICE inline auto load_volatile(T* input) -> + typename std::enable_if::value, T>::type + { + T retval = *const_cast(input); + return retval; + } - // Alias - ROCPRIM_HOST_DEVICE T& get() + template + ROCPRIM_DEVICE inline auto load_volatile(T* input) -> + typename std::enable_if::value, T>::type { - return reinterpret_cast(*this); + using fundamental_type = typename match_fundamental_type::type; + constexpr unsigned int n = sizeof(T) / sizeof(fundamental_type); + + T retval; + auto output_ptr = reinterpret_cast(&retval); + auto input_ptr = reinterpret_cast(input); + +#pragma unroll + for(unsigned int i = 0; i < n; i++) + { + output_ptr[i] = input_ptr[i]; + } + return retval; } -}; -// Checks if two iterators have the same type and value -template -inline -bool are_iterators_equal(Iterator1, Iterator2) -{ - return false; -} + // A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions + template + struct raw_storage + { + // Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename detail::match_fundamental_type::type device_word; + + // Backing storage + device_word storage[sizeof(T) / sizeof(device_word)]; + + // Alias + ROCPRIM_HOST_DEVICE T& get() + { + return reinterpret_cast(*this); + } + }; + + // Checks if two iterators have the same type and value + template + inline bool are_iterators_equal(Iterator1, Iterator2) + { + return false; + } -template -inline -bool are_iterators_equal(Iterator iter1, Iterator iter2) -{ - return iter1 == iter2; -} + template + inline bool are_iterators_equal(Iterator iter1, Iterator iter2) + { + return iter1 == iter2; + } } // end namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp index b01c38710..485d4b104 100644 --- a/rocprim/include/rocprim/device/config_types.hpp +++ b/rocprim/include/rocprim/device/config_types.hpp @@ -34,13 +34,15 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Special type used to show that the given device-level operation /// will be executed with optimal configuration dependent on types of the function's parameters /// and the target device architecture specified by ROCPRIM_TARGET_ARCH. -struct default_config { }; +struct default_config +{ +}; /// \brief Configuration of particular kernels launched by device-level operation /// /// \tparam BlockSize - number of threads in a block. /// \tparam ItemsPerThread - number of items processed by each thread. -template +template struct kernel_config { /// \brief Number of threads in a block. @@ -52,104 +54,101 @@ struct kernel_config namespace detail { -template< - unsigned int MaxBlockSize, - unsigned int SharedMemoryPerThread, - // Most kernels require block sizes not smaller than warp - unsigned int MinBlockSize = ::rocprim::warp_size(), - // Can fit in shared memory? - // Although GPUs have 64KiB, 32KiB is used here as a "soft" limit, - // because some additional memory may be required in kernels - bool = (MaxBlockSize * SharedMemoryPerThread <= (1u << 15)) -> -struct limit_block_size -{ - // No, then try to decrease block size - static constexpr unsigned int value = - limit_block_size< - detail::next_power_of_two(MaxBlockSize) / 2, - SharedMemoryPerThread, - MinBlockSize - >::value; -}; - -template< - unsigned int MaxBlockSize, - unsigned int SharedMemoryPerThread, - unsigned int MinBlockSize -> -struct limit_block_size -{ - static_assert(MaxBlockSize >= MinBlockSize, "Data is too large, it cannot fit in shared memory"); - - static constexpr unsigned int value = MaxBlockSize; -}; - -template -using void_t = void; - -template -struct extract_type : T { }; - -template -struct extract_type > : extract_type { }; - -template -struct select_type_case -{ - static constexpr bool value = Value; - using type = T; -}; - -template -struct select_type - : std::conditional< - Case::value, - extract_type, - select_type - >::type { }; - -template -struct select_type> : extract_type { }; - -template -struct select_type> -{ - static_assert( - sizeof(T) == 0, - "Cannot select any case. " - "The last case must have true condition or be a fallback type." - ); -}; - -template -struct select_type : extract_type { }; - -template -struct select_arch_case -{ - static constexpr unsigned int arch = Arch; - using type = T; -}; - -template -struct select_arch - : std::conditional< - Case::arch == TargetArch, - extract_type, - select_arch - >::type { }; - -template -struct select_arch : extract_type { }; - -template -using default_or_custom_config = - typename std::conditional< - std::is_same::value, - Default, - Config - >::type; + template + struct limit_block_size + { + // No, then try to decrease block size + static constexpr unsigned int value + = limit_block_size::value; + }; + + template + struct limit_block_size + { + static_assert(MaxBlockSize >= MinBlockSize, + "Data is too large, it cannot fit in shared memory"); + + static constexpr unsigned int value = MaxBlockSize; + }; + + template + using void_t = void; + + template + struct extract_type : T + { + }; + + template + struct extract_type> : extract_type + { + }; + + template + struct select_type_case + { + static constexpr bool value = Value; + using type = T; + }; + + template + struct select_type : std::conditional, + select_type>::type + { + }; + + template + struct select_type> : extract_type + { + }; + + template + struct select_type> + { + static_assert(sizeof(T) == 0, + "Cannot select any case. " + "The last case must have true condition or be a fallback type."); + }; + + template + struct select_type : extract_type + { + }; + + template + struct select_arch_case + { + static constexpr unsigned int arch = Arch; + using type = T; + }; + + template + struct select_arch : std::conditional, + select_arch>::type + { + }; + + template + struct select_arch : extract_type + { + }; + + template + using default_or_custom_config = typename std:: + conditional::value, Default, Config>::type; } // end namespace detail diff --git a/rocprim/include/rocprim/device/detail/device_binary_search.hpp b/rocprim/include/rocprim/device/detail/device_binary_search.hpp index 5e4b54683..6567fe373 100644 --- a/rocprim/include/rocprim/device/detail/device_binary_search.hpp +++ b/rocprim/include/rocprim/device/detail/device_binary_search.hpp @@ -26,98 +26,101 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -ROCPRIM_DEVICE inline -Size get_binary_search_middle(Size left, Size right) -{ - // Instead of `/ 2` we use `* 33 / 64`, i.e. the middle is slightly moved. - // This greatly reduces address aliasing and hence cache misses for (nearly-)power-of-two - // sizes of haystack (when addresses are mapped to the same cache line). - // For random needles and (nearly-)power-of-two sizes, this change increases performance - // 4-20 times making it equal to performance of arbitrary sizes of haystack. - // See https://www.pvk.ca/Blog/2012/07/30/binary-search-is-a-pathological-case-for-caches/ - const Size d = right - left; - return left + d / 2 + d / 64; -} + template + ROCPRIM_DEVICE inline Size get_binary_search_middle(Size left, Size right) + { + // Instead of `/ 2` we use `* 33 / 64`, i.e. the middle is slightly moved. + // This greatly reduces address aliasing and hence cache misses for (nearly-)power-of-two + // sizes of haystack (when addresses are mapped to the same cache line). + // For random needles and (nearly-)power-of-two sizes, this change increases performance + // 4-20 times making it equal to performance of arbitrary sizes of haystack. + // See https://www.pvk.ca/Blog/2012/07/30/binary-search-is-a-pathological-case-for-caches/ + const Size d = right - left; + return left + d / 2 + d / 64; + } -template -ROCPRIM_DEVICE inline -Size lower_bound_n(RandomAccessIterator first, - Size size, - const T& value, - BinaryPredicate compare_op) -{ - Size left = 0; - Size right = size; - while(left < right) + template + ROCPRIM_DEVICE inline Size lower_bound_n(RandomAccessIterator first, + Size size, + const T& value, + BinaryPredicate compare_op) { - const Size mid = get_binary_search_middle(left, right); - if(compare_op(first[mid], value)) - { - left = mid + 1; - } - else + Size left = 0; + Size right = size; + while(left < right) { - right = mid; + const Size mid = get_binary_search_middle(left, right); + if(compare_op(first[mid], value)) + { + left = mid + 1; + } + else + { + right = mid; + } } + return left; } - return left; -} -template -ROCPRIM_DEVICE inline -Size upper_bound_n(RandomAccessIterator first, - Size size, - const T& value, - BinaryPredicate compare_op) -{ - Size left = 0; - Size right = size; - while(left < right) + template + ROCPRIM_DEVICE inline Size upper_bound_n(RandomAccessIterator first, + Size size, + const T& value, + BinaryPredicate compare_op) { - const Size mid = get_binary_search_middle(left, right); - if(compare_op(value, first[mid])) - { - right = mid; - } - else + Size left = 0; + Size right = size; + while(left < right) { - left = mid + 1; + const Size mid = get_binary_search_middle(left, right); + if(compare_op(value, first[mid])) + { + right = mid; + } + else + { + left = mid + 1; + } } + return left; } - return left; -} -struct lower_bound_search_op -{ - template - ROCPRIM_DEVICE inline - Size operator()(HaystackIterator haystack, Size size, const T& value, CompareOp compare_op) const + struct lower_bound_search_op { - return lower_bound_n(haystack, size, value, compare_op); - } -}; + template + ROCPRIM_DEVICE inline Size operator()(HaystackIterator haystack, + Size size, + const T& value, + CompareOp compare_op) const + { + return lower_bound_n(haystack, size, value, compare_op); + } + }; -struct upper_bound_search_op -{ - template - ROCPRIM_DEVICE inline - Size operator()(HaystackIterator haystack, Size size, const T& value, CompareOp compare_op) const + struct upper_bound_search_op { - return upper_bound_n(haystack, size, value, compare_op); - } -}; + template + ROCPRIM_DEVICE inline Size operator()(HaystackIterator haystack, + Size size, + const T& value, + CompareOp compare_op) const + { + return upper_bound_n(haystack, size, value, compare_op); + } + }; -struct binary_search_op -{ - template - ROCPRIM_DEVICE inline - bool operator()(HaystackIterator haystack, Size size, const T& value, CompareOp compare_op) const + struct binary_search_op { - const Size n = lower_bound_n(haystack, size, value, compare_op); - return n != size && !compare_op(value, haystack[n]); - } -}; + template + ROCPRIM_DEVICE inline bool operator()(HaystackIterator haystack, + Size size, + const T& value, + CompareOp compare_op) const + { + const Size n = lower_bound_n(haystack, size, value, compare_op); + return n != size && !compare_op(value, haystack[n]); + } + }; } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_histogram.hpp b/rocprim/include/rocprim/device/detail/device_histogram.hpp index a1d948138..a3c93415d 100644 --- a/rocprim/include/rocprim/device/detail/device_histogram.hpp +++ b/rocprim/include/rocprim/device/detail/device_histogram.hpp @@ -22,14 +22,14 @@ #define ROCPRIM_DEVICE_DETAIL_DEVICE_HISTOGRAM_HPP_ #include -#include #include +#include #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "uint_fast_div.hpp" @@ -38,511 +38,476 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Special wrapper for passing fixed-length arrays (i.e. T values[Size]) into kernels -template -class fixed_array -{ -private: - T values[Size]; + // Special wrapper for passing fixed-length arrays (i.e. T values[Size]) into kernels + template + class fixed_array + { + private: + T values[Size]; -public: + public: + ROCPRIM_HOST_DEVICE inline fixed_array(const T values[Size]) + { + for(unsigned int i = 0; i < Size; i++) + { + this->values[i] = values[i]; + } + } - ROCPRIM_HOST_DEVICE inline - fixed_array(const T values[Size]) - { - for(unsigned int i = 0; i < Size; i++) + ROCPRIM_HOST_DEVICE inline T& operator[](unsigned int index) { - this->values[i] = values[i]; + return values[index]; } - } - ROCPRIM_HOST_DEVICE inline - T& operator[](unsigned int index) - { - return values[index]; - } + ROCPRIM_HOST_DEVICE inline const T& operator[](unsigned int index) const + { + return values[index]; + } + }; - ROCPRIM_HOST_DEVICE inline - const T& operator[](unsigned int index) const + template + struct sample_to_bin_even { - return values[index]; - } -}; + unsigned int bins; + Level lower_level; + Level upper_level; + Level scale; + + ROCPRIM_HOST_DEVICE inline sample_to_bin_even() = default; + + ROCPRIM_HOST_DEVICE inline sample_to_bin_even(unsigned int bins, + Level lower_level, + Level upper_level) + : bins(bins) + , lower_level(lower_level) + , upper_level(upper_level) + , scale((upper_level - lower_level) / bins) + { + } -template -struct sample_to_bin_even -{ - unsigned int bins; - Level lower_level; - Level upper_level; - Level scale; - - ROCPRIM_HOST_DEVICE inline - sample_to_bin_even() = default; - - ROCPRIM_HOST_DEVICE inline - sample_to_bin_even(unsigned int bins, Level lower_level, Level upper_level) - : bins(bins), - lower_level(lower_level), - upper_level(upper_level), - scale((upper_level - lower_level) / bins) - {} - - template - ROCPRIM_HOST_DEVICE inline - bool operator()(Sample sample, unsigned int& bin) const - { - const Level s = static_cast(sample); - if(s >= lower_level && s < upper_level) + template + ROCPRIM_HOST_DEVICE inline bool operator()(Sample sample, unsigned int& bin) const { - bin = static_cast((s - lower_level) / scale); - return true; + const Level s = static_cast(sample); + if(s >= lower_level && s < upper_level) + { + bin = static_cast((s - lower_level) / scale); + return true; + } + return false; } - return false; - } -}; + }; -// This specialization uses fast division (uint_fast_div) for integers smaller than 64 bit -template -struct sample_to_bin_even::value && (sizeof(Level) <= 4)>::type> -{ - unsigned int bins; - Level lower_level; - Level upper_level; - uint_fast_div scale; - - ROCPRIM_HOST_DEVICE inline - sample_to_bin_even() = default; - - ROCPRIM_HOST_DEVICE inline - sample_to_bin_even(unsigned int bins, Level lower_level, Level upper_level) - : bins(bins), - lower_level(lower_level), - upper_level(upper_level), - scale((upper_level - lower_level) / bins) - {} - - template - ROCPRIM_HOST_DEVICE inline - bool operator()(Sample sample, unsigned int& bin) const + // This specialization uses fast division (uint_fast_div) for integers smaller than 64 bit + template + struct sample_to_bin_even< + Level, + typename std::enable_if::value && (sizeof(Level) <= 4)>::type> { - const Level s = static_cast(sample); - if(s >= lower_level && s < upper_level) + unsigned int bins; + Level lower_level; + Level upper_level; + uint_fast_div scale; + + ROCPRIM_HOST_DEVICE inline sample_to_bin_even() = default; + + ROCPRIM_HOST_DEVICE inline sample_to_bin_even(unsigned int bins, + Level lower_level, + Level upper_level) + : bins(bins) + , lower_level(lower_level) + , upper_level(upper_level) + , scale((upper_level - lower_level) / bins) { - bin = static_cast(s - lower_level) / scale; - return true; } - return false; - } -}; -// This specialization uses multiplication by inv divisor for floats -template -struct sample_to_bin_even::value>::type> -{ - unsigned int bins; - Level lower_level; - Level upper_level; - Level inv_scale; - - ROCPRIM_HOST_DEVICE inline - sample_to_bin_even() = default; - - ROCPRIM_HOST_DEVICE inline - sample_to_bin_even(unsigned int bins, Level lower_level, Level upper_level) - : bins(bins), - lower_level(lower_level), - upper_level(upper_level), - inv_scale(bins / (upper_level - lower_level)) - {} - - template - ROCPRIM_HOST_DEVICE inline - bool operator()(Sample sample, unsigned int& bin) const - { - const Level s = static_cast(sample); - if(s >= lower_level && s < upper_level) + template + ROCPRIM_HOST_DEVICE inline bool operator()(Sample sample, unsigned int& bin) const { - bin = static_cast((s - lower_level) * inv_scale); - return true; + const Level s = static_cast(sample); + if(s >= lower_level && s < upper_level) + { + bin = static_cast(s - lower_level) / scale; + return true; + } + return false; } - return false; - } -}; + }; -// Returns index of the first element in values that is greater than value, or count if no such element is found. -template -ROCPRIM_HOST_DEVICE inline -unsigned int upper_bound(const T * values, unsigned int count, T value) -{ - unsigned int current = 0; - while(count > 0) + // This specialization uses multiplication by inv divisor for floats + template + struct sample_to_bin_even::value>::type> { - const unsigned int step = count / 2; - const unsigned int next = current + step; - if(value < values[next]) + unsigned int bins; + Level lower_level; + Level upper_level; + Level inv_scale; + + ROCPRIM_HOST_DEVICE inline sample_to_bin_even() = default; + + ROCPRIM_HOST_DEVICE inline sample_to_bin_even(unsigned int bins, + Level lower_level, + Level upper_level) + : bins(bins) + , lower_level(lower_level) + , upper_level(upper_level) + , inv_scale(bins / (upper_level - lower_level)) { - count = step; } - else + + template + ROCPRIM_HOST_DEVICE inline bool operator()(Sample sample, unsigned int& bin) const { - current = next + 1; - count -= step + 1; + const Level s = static_cast(sample); + if(s >= lower_level && s < upper_level) + { + bin = static_cast((s - lower_level) * inv_scale); + return true; + } + return false; } - } - return current; -} + }; -template -struct sample_to_bin_range -{ - unsigned int bins; - const Level * level_values; + // Returns index of the first element in values that is greater than value, or count if no such element is found. + template + ROCPRIM_HOST_DEVICE inline unsigned int + upper_bound(const T* values, unsigned int count, T value) + { + unsigned int current = 0; + while(count > 0) + { + const unsigned int step = count / 2; + const unsigned int next = current + step; + if(value < values[next]) + { + count = step; + } + else + { + current = next + 1; + count -= step + 1; + } + } + return current; + } - ROCPRIM_HOST_DEVICE inline - sample_to_bin_range() = default; + template + struct sample_to_bin_range + { + unsigned int bins; + const Level* level_values; - ROCPRIM_HOST_DEVICE inline - sample_to_bin_range(unsigned int bins, const Level * level_values) - : bins(bins), level_values(level_values) - {} + ROCPRIM_HOST_DEVICE inline sample_to_bin_range() = default; - template - ROCPRIM_HOST_DEVICE inline - bool operator()(Sample sample, unsigned int& bin) const - { - const Level s = static_cast(sample); - bin = upper_bound(level_values, bins + 1, s) - 1; - return bin < bins; - } -}; + ROCPRIM_HOST_DEVICE inline sample_to_bin_range(unsigned int bins, const Level* level_values) + : bins(bins) + , level_values(level_values) + { + } -template -struct sample_vector -{ - T values[Size]; -}; - -// Checks if it is possible to load 2 or 4 sample_vector as one 32-bit value -template< - unsigned int ItemsPerThread, - unsigned int Channels, - class Sample -> -struct is_sample_vectorizable - : std::integral_constant< - bool, - ((sizeof(Sample) * Channels == 1) || (sizeof(Sample) * Channels == 2)) && - (sizeof(Sample) * Channels * ItemsPerThread % sizeof(int) == 0) && - (sizeof(Sample) * Channels * ItemsPerThread / sizeof(int) > 0) - > { }; - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - class Sample -> -ROCPRIM_DEVICE inline -typename std::enable_if::value>::type -load_samples(unsigned int flat_id, - Sample * samples, - sample_vector (&values)[ItemsPerThread]) -{ - using packed_samples_type = int[sizeof(Sample) * Channels * ItemsPerThread / sizeof(int)]; + template + ROCPRIM_HOST_DEVICE inline bool operator()(Sample sample, unsigned int& bin) const + { + const Level s = static_cast(sample); + bin = upper_bound(level_values, bins + 1, s) - 1; + return bin < bins; + } + }; - if(reinterpret_cast(samples) % sizeof(int) == 0) + template + struct sample_vector { - // the pointer is aligned by 4 bytes - block_load_direct_striped( - flat_id, - reinterpret_cast(samples), - reinterpret_cast(values) - ); + T values[Size]; + }; + + // Checks if it is possible to load 2 or 4 sample_vector as one 32-bit value + template + struct is_sample_vectorizable + : std::integral_constant< + bool, + ((sizeof(Sample) * Channels == 1) || (sizeof(Sample) * Channels == 2)) + && (sizeof(Sample) * Channels * ItemsPerThread % sizeof(int) == 0) + && (sizeof(Sample) * Channels * ItemsPerThread / sizeof(int) > 0)> + { + }; + + template + ROCPRIM_DEVICE inline typename std::enable_if< + is_sample_vectorizable::value>::type + load_samples(unsigned int flat_id, + Sample* samples, + sample_vector (&values)[ItemsPerThread]) + { + using packed_samples_type = int[sizeof(Sample) * Channels * ItemsPerThread / sizeof(int)]; + + if(reinterpret_cast(samples) % sizeof(int) == 0) + { + // the pointer is aligned by 4 bytes + block_load_direct_striped(flat_id, + reinterpret_cast(samples), + reinterpret_cast(values)); + } + else + { + block_load_direct_striped( + flat_id, reinterpret_cast*>(samples), values); + } } - else + + template + ROCPRIM_DEVICE inline typename std::enable_if< + !is_sample_vectorizable::value>::type + load_samples(unsigned int flat_id, + Sample* samples, + sample_vector (&values)[ItemsPerThread]) { block_load_direct_striped( - flat_id, - reinterpret_cast *>(samples), - values - ); + flat_id, reinterpret_cast*>(samples), values); } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - class Sample -> -ROCPRIM_DEVICE inline -typename std::enable_if::value>::type -load_samples(unsigned int flat_id, - Sample * samples, - sample_vector (&values)[ItemsPerThread]) -{ - block_load_direct_striped( - flat_id, - reinterpret_cast *>(samples), - values - ); -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - class Sample, - class SampleIterator -> -ROCPRIM_DEVICE inline -void load_samples(unsigned int flat_id, - SampleIterator samples, - sample_vector (&values)[ItemsPerThread]) -{ - Sample tmp[Channels * ItemsPerThread]; - block_load_direct_blocked( - flat_id, - samples, - tmp - ); - for(unsigned int i = 0; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void + load_samples(unsigned int flat_id, + SampleIterator samples, + sample_vector (&values)[ItemsPerThread]) { - for(unsigned int channel = 0; channel < Channels; channel++) + Sample tmp[Channels * ItemsPerThread]; + block_load_direct_blocked(flat_id, samples, tmp); + for(unsigned int i = 0; i < ItemsPerThread; i++) { - values[i].values[channel] = tmp[i * Channels + channel]; + for(unsigned int channel = 0; channel < Channels; channel++) + { + values[i].values[channel] = tmp[i * Channels + channel]; + } } } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - class Sample, - class SampleIterator -> -ROCPRIM_DEVICE inline -void load_samples(unsigned int flat_id, - SampleIterator samples, - sample_vector (&values)[ItemsPerThread], - unsigned int valid_count) -{ - Sample tmp[Channels * ItemsPerThread]; - block_load_direct_blocked( - flat_id, - samples, - tmp, - valid_count * Channels - ); - for(unsigned int i = 0; i < ItemsPerThread; i++) + + template + ROCPRIM_DEVICE inline void + load_samples(unsigned int flat_id, + SampleIterator samples, + sample_vector (&values)[ItemsPerThread], + unsigned int valid_count) { - for(unsigned int channel = 0; channel < Channels; channel++) + Sample tmp[Channels * ItemsPerThread]; + block_load_direct_blocked(flat_id, samples, tmp, valid_count * Channels); + for(unsigned int i = 0; i < ItemsPerThread; i++) { - values[i].values[channel] = tmp[i * Channels + channel]; + for(unsigned int channel = 0; channel < Channels; channel++) + { + values[i].values[channel] = tmp[i * Channels + channel]; + } } } -} - -template< - unsigned int BlockSize, - unsigned int ActiveChannels, - class Counter -> -ROCPRIM_DEVICE inline -void init_histogram(fixed_array histogram, - fixed_array bins) -{ - const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int block_id = ::rocprim::detail::block_id<0>(); - const unsigned int index = block_id * BlockSize + flat_id; - for(unsigned int channel = 0; channel < ActiveChannels; channel++) + template + ROCPRIM_DEVICE inline void init_histogram(fixed_array histogram, + fixed_array bins) { - if(index < bins[channel]) + const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int block_id = ::rocprim::detail::block_id<0>(); + + const unsigned int index = block_id * BlockSize + flat_id; + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - histogram[channel][index] = 0; + if(index < bins[channel]) + { + histogram[channel][index] = 0; + } } } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - unsigned int ActiveChannels, - class SampleIterator, - class Counter, - class SampleToBinOp -> -ROCPRIM_DEVICE inline -void histogram_shared(SampleIterator samples, - unsigned int columns, - unsigned int rows, - unsigned int row_stride, - unsigned int rows_per_block, - fixed_array histogram, - fixed_array sample_to_bin_op, - fixed_array bins, - unsigned int * block_histogram_start) -{ - using sample_type = typename std::iterator_traits::value_type; - using sample_vector_type = sample_vector; - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + template + ROCPRIM_DEVICE inline void + histogram_shared(SampleIterator samples, + unsigned int columns, + unsigned int rows, + unsigned int row_stride, + unsigned int rows_per_block, + fixed_array histogram, + fixed_array sample_to_bin_op, + fixed_array bins, + unsigned int* block_histogram_start) + { + using sample_type = typename std::iterator_traits::value_type; + using sample_vector_type = sample_vector; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int block_id0 = ::rocprim::detail::block_id<0>(); - const unsigned int block_id1 = ::rocprim::detail::block_id<1>(); - const unsigned int grid_size0 = ::rocprim::detail::grid_size<0>(); + const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int block_id0 = ::rocprim::detail::block_id<0>(); + const unsigned int block_id1 = ::rocprim::detail::block_id<1>(); + const unsigned int grid_size0 = ::rocprim::detail::grid_size<0>(); - unsigned int * block_histogram[ActiveChannels]; - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - block_histogram[channel] = block_histogram_start; - block_histogram_start += bins[channel]; - for(unsigned int bin = flat_id; bin < bins[channel]; bin += BlockSize) + unsigned int* block_histogram[ActiveChannels]; + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - block_histogram[channel][bin] = 0; + block_histogram[channel] = block_histogram_start; + block_histogram_start += bins[channel]; + for(unsigned int bin = flat_id; bin < bins[channel]; bin += BlockSize) + { + block_histogram[channel][bin] = 0; + } } - } - ::rocprim::syncthreads(); + ::rocprim::syncthreads(); - const unsigned int start_row = block_id1 * rows_per_block; - const unsigned int end_row = ::rocprim::min(rows, start_row + rows_per_block); - for(unsigned int row = start_row; row < end_row; row++) - { - SampleIterator row_samples = samples + row * row_stride; - - unsigned int block_offset = block_id0 * items_per_block; - while(block_offset < columns) + const unsigned int start_row = block_id1 * rows_per_block; + const unsigned int end_row = ::rocprim::min(rows, start_row + rows_per_block); + for(unsigned int row = start_row; row < end_row; row++) { - sample_vector_type values[ItemsPerThread]; + SampleIterator row_samples = samples + row * row_stride; - if(block_offset + items_per_block <= columns) + unsigned int block_offset = block_id0 * items_per_block; + while(block_offset < columns) { - load_samples(flat_id, row_samples + Channels * block_offset, values); + sample_vector_type values[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) + if(block_offset + items_per_block <= columns) { - for(unsigned int channel = 0; channel < ActiveChannels; channel++) + load_samples(flat_id, row_samples + Channels * block_offset, values); + + for(unsigned int i = 0; i < ItemsPerThread; i++) { - unsigned int bin; - if(sample_to_bin_op[channel](values[i].values[channel], bin)) + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - ::rocprim::detail::atomic_add(&block_histogram[channel][bin], 1); + unsigned int bin; + if(sample_to_bin_op[channel](values[i].values[channel], bin)) + { + ::rocprim::detail::atomic_add(&block_histogram[channel][bin], 1); + } } } } - } - else - { - const unsigned int valid_count = columns - block_offset; - load_samples(flat_id, row_samples + Channels * block_offset, values, valid_count); - - for(unsigned int i = 0; i < ItemsPerThread; i++) + else { - if(flat_id * ItemsPerThread + i < valid_count) + const unsigned int valid_count = columns - block_offset; + load_samples( + flat_id, row_samples + Channels * block_offset, values, valid_count); + + for(unsigned int i = 0; i < ItemsPerThread; i++) { - for(unsigned int channel = 0; channel < ActiveChannels; channel++) + if(flat_id * ItemsPerThread + i < valid_count) { - unsigned int bin; - if(sample_to_bin_op[channel](values[i].values[channel], bin)) + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - ::rocprim::detail::atomic_add(&block_histogram[channel][bin], 1); + unsigned int bin; + if(sample_to_bin_op[channel](values[i].values[channel], bin)) + { + ::rocprim::detail::atomic_add(&block_histogram[channel][bin], + 1); + } } } } } - } - block_offset += grid_size0 * items_per_block; + block_offset += grid_size0 * items_per_block; + } } - } - ::rocprim::syncthreads(); + ::rocprim::syncthreads(); - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - for(unsigned int bin = flat_id; bin < bins[channel]; bin += BlockSize) + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - if(block_histogram[channel][bin] > 0) + for(unsigned int bin = flat_id; bin < bins[channel]; bin += BlockSize) { - ::rocprim::detail::atomic_add(&histogram[channel][bin], block_histogram[channel][bin]); + if(block_histogram[channel][bin] > 0) + { + ::rocprim::detail::atomic_add(&histogram[channel][bin], + block_histogram[channel][bin]); + } } } } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - unsigned int ActiveChannels, - class SampleIterator, - class Counter, - class SampleToBinOp -> -ROCPRIM_DEVICE inline -void histogram_global(SampleIterator samples, - unsigned int columns, - unsigned int row_stride, - fixed_array histogram, - fixed_array sample_to_bin_op, - fixed_array bins_bits) -{ - using sample_type = typename std::iterator_traits::value_type; - using sample_vector_type = sample_vector; - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + template + ROCPRIM_DEVICE inline void + histogram_global(SampleIterator samples, + unsigned int columns, + unsigned int row_stride, + fixed_array histogram, + fixed_array sample_to_bin_op, + fixed_array bins_bits) + { + using sample_type = typename std::iterator_traits::value_type; + using sample_vector_type = sample_vector; - const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int block_id0 = ::rocprim::detail::block_id<0>(); - const unsigned int block_id1 = ::rocprim::detail::block_id<1>(); - const unsigned int block_offset = block_id0 * items_per_block; + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - samples += block_id1 * row_stride + Channels * block_offset; + const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int block_id0 = ::rocprim::detail::block_id<0>(); + const unsigned int block_id1 = ::rocprim::detail::block_id<1>(); + const unsigned int block_offset = block_id0 * items_per_block; - sample_vector_type values[ItemsPerThread]; - unsigned int valid_count; - if(block_offset + items_per_block <= columns) - { - valid_count = items_per_block; - load_samples(flat_id, samples, values); - } - else - { - valid_count = columns - block_offset; - load_samples(flat_id, samples, values, valid_count); - } + samples += block_id1 * row_stride + Channels * block_offset; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - for(unsigned int channel = 0; channel < ActiveChannels; channel++) + sample_vector_type values[ItemsPerThread]; + unsigned int valid_count; + if(block_offset + items_per_block <= columns) + { + valid_count = items_per_block; + load_samples(flat_id, samples, values); + } + else + { + valid_count = columns - block_offset; + load_samples(flat_id, samples, values, valid_count); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) { - unsigned int bin; - if(sample_to_bin_op[channel](values[i].values[channel], bin)) + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - const unsigned int pos = flat_id * ItemsPerThread + i; - unsigned long long same_bin_lanes_mask = ::rocprim::ballot(pos < valid_count); - for(unsigned int b = 0; b < bins_bits[channel]; b++) - { - const unsigned int bit_set = bin & (1u << b); - const unsigned long long bit_set_mask = ::rocprim::ballot(bit_set); - same_bin_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask); - } - const unsigned int same_bin_count = ::rocprim::bit_count(same_bin_lanes_mask); - const unsigned int prev_same_bin_count = ::rocprim::masked_bit_count(same_bin_lanes_mask); - if(prev_same_bin_count == 0) + unsigned int bin; + if(sample_to_bin_op[channel](values[i].values[channel], bin)) { - // Write the number of lanes having this bin, - // if the current lane is the first (and maybe only) lane with this bin. - ::rocprim::detail::atomic_add(&histogram[channel][bin], same_bin_count); + const unsigned int pos = flat_id * ItemsPerThread + i; + unsigned long long same_bin_lanes_mask = ::rocprim::ballot(pos < valid_count); + for(unsigned int b = 0; b < bins_bits[channel]; b++) + { + const unsigned int bit_set = bin & (1u << b); + const unsigned long long bit_set_mask = ::rocprim::ballot(bit_set); + same_bin_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask); + } + const unsigned int same_bin_count = ::rocprim::bit_count(same_bin_lanes_mask); + const unsigned int prev_same_bin_count + = ::rocprim::masked_bit_count(same_bin_lanes_mask); + if(prev_same_bin_count == 0) + { + // Write the number of lanes having this bin, + // if the current lane is the first (and maybe only) lane with this bin. + ::rocprim::detail::atomic_add(&histogram[channel][bin], same_bin_count); + } } } } } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_merge.hpp b/rocprim/include/rocprim/device/detail/device_merge.hpp index 40043ee6e..32c47a8d2 100644 --- a/rocprim/include/rocprim/device/detail/device_merge.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge.hpp @@ -21,14 +21,14 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_MERGE_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_MERGE_HPP_ -#include #include +#include #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_store.hpp" @@ -38,415 +38,357 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -struct range_t -{ - unsigned int begin1; - unsigned int end1; - unsigned int begin2; - unsigned int end2; - - ROCPRIM_DEVICE inline - unsigned int count1() + struct range_t { - return end1 - begin1; - } - - ROCPRIM_DEVICE inline - unsigned int count2() - { - return end2 - begin2; - } -}; - -ROCPRIM_DEVICE inline -range_t compute_range(const unsigned int id, - const unsigned int size1, - const unsigned int size2, - const unsigned int spacing, - const unsigned int p1, - const unsigned int p2) -{ - unsigned int diag1 = id * spacing; - unsigned int diag2 = min(size1 + size2, diag1 + spacing); - - return range_t{p1, p2, diag1 - p1, diag2 - p2}; -} - -template< - class KeysInputIterator1, - class KeysInputIterator2, - class BinaryFunction -> -ROCPRIM_DEVICE inline -unsigned int merge_path(KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - const size_t input1_size, - const size_t input2_size, - const unsigned int diag, - BinaryFunction compare_function) -{ - using key_type = typename std::iterator_traits::value_type; - - int begin = max((int)0, (int)diag - (int)input2_size); - int end = min((int)diag, (int)input1_size); + unsigned int begin1; + unsigned int end1; + unsigned int begin2; + unsigned int end2; - while(begin < end) - { - unsigned int a = (begin + end) / 2; - unsigned int b = diag - 1 - a; - key_type input_a = keys_input1[a]; - key_type input_b = keys_input2[b]; - if(!compare_function(input_b, input_a)) + ROCPRIM_DEVICE inline unsigned int count1() { - begin = a + 1; + return end1 - begin1; } - else + + ROCPRIM_DEVICE inline unsigned int count2() { - end = a; + return end2 - begin2; } + }; + + ROCPRIM_DEVICE inline range_t compute_range(const unsigned int id, + const unsigned int size1, + const unsigned int size2, + const unsigned int spacing, + const unsigned int p1, + const unsigned int p2) + { + unsigned int diag1 = id * spacing; + unsigned int diag2 = min(size1 + size2, diag1 + spacing); + + return range_t {p1, p2, diag1 - p1, diag2 - p2}; } - return begin; -} - -template< - class IndexIterator, - class KeysInputIterator1, - class KeysInputIterator2, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void partition_kernel_impl(IndexIterator indices, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - const size_t input1_size, - const size_t input2_size, - const unsigned int spacing, - BinaryFunction compare_function) -{ - const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - const unsigned int flat_block_size = ::rocprim::detail::block_size<0>(); - - unsigned int id = flat_block_id * flat_block_size + flat_id; - - unsigned int partition_id = id * spacing; - unsigned int diag = min(partition_id, (unsigned int)(input1_size + input2_size)); - - unsigned int begin = - merge_path( - keys_input1, - keys_input2, - input1_size, - input2_size, - diag, - compare_function - ); - - indices[id] = begin; -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class KeysInputIterator1, - class KeysInputIterator2, - class KeyType -> -ROCPRIM_DEVICE inline -void load(unsigned int flat_id, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - KeyType * keys_shared, - const size_t input1_size, - const size_t input2_size) -{ - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; ++i) + template + ROCPRIM_DEVICE inline unsigned int merge_path(KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + const size_t input1_size, + const size_t input2_size, + const unsigned int diag, + BinaryFunction compare_function) { - unsigned int index = BlockSize * i + flat_id; - if(index < input1_size) - { - keys_shared[index] = keys_input1[index]; - } - else if(index < input1_size + input2_size) + using key_type = typename std::iterator_traits::value_type; + + int begin = max((int)0, (int)diag - (int)input2_size); + int end = min((int)diag, (int)input1_size); + + while(begin < end) { - keys_shared[index] = keys_input2[index - input1_size]; + unsigned int a = (begin + end) / 2; + unsigned int b = diag - 1 - a; + key_type input_a = keys_input1[a]; + key_type input_b = keys_input2[b]; + if(!compare_function(input_b, input_a)) + { + begin = a + 1; + } + else + { + end = a; + } } - } - ::rocprim::syncthreads(); -} - -template< - class KeyType, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void serial_merge(KeyType * keys_shared, - KeyType (&inputs)[ItemsPerThread], - unsigned int (&index)[ItemsPerThread], - range_t range, - BinaryFunction compare_function) -{ - KeyType a = keys_shared[range.begin1]; - KeyType b = keys_shared[range.begin2]; + return begin; + } - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; ++i) + template + ROCPRIM_DEVICE inline void partition_kernel_impl(IndexIterator indices, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + const size_t input1_size, + const size_t input2_size, + const unsigned int spacing, + BinaryFunction compare_function) { - bool compare = (range.begin2 >= range.end2) || - ((range.begin1 < range.end1) && !compare_function(b, a)); - unsigned int x = compare ? range.begin1 : range.begin2; + const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + const unsigned int flat_block_size = ::rocprim::detail::block_size<0>(); - inputs[i] = compare ? a : b; - index[i] = x; + unsigned int id = flat_block_id * flat_block_size + flat_id; - KeyType c = keys_shared[++x]; - if(compare) - { - a = c; - range.begin1 = x; - } - else - { - b = c; - range.begin2 = x; - } - } - ::rocprim::syncthreads(); -} - -template< - unsigned int BlockSize, - class KeysInputIterator1, - class KeysInputIterator2, - class KeyType, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void merge_keys(unsigned int flat_id, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - KeyType (&key_inputs)[ItemsPerThread], - unsigned int (&index)[ItemsPerThread], - KeyType * keys_shared, - range_t range, - BinaryFunction compare_function) -{ - load( - flat_id, keys_input1 + range.begin1, keys_input2 + range.begin2, - keys_shared, range.count1(), range.count2() - ); - - range_t range_local = - range_t { - 0, range.count1(), range.count1(), - (range.count1() + range.count2()) - }; - - unsigned int diag = ItemsPerThread * flat_id; - unsigned int partition = - merge_path( - keys_shared + range_local.begin1, - keys_shared + range_local.begin2, - range_local.count1(), - range_local.count2(), - diag, - compare_function - ); - - range_t range_partition = - range_t { - range_local.begin1 + partition, - range_local.end1, - range_local.begin2 + diag - partition, - range_local.end2 - }; - - serial_merge( - keys_shared, key_inputs, index, range_partition, - compare_function - ); -} - -template< - bool WithValues, - unsigned int BlockSize, - class ValuesInputIterator1, - class ValuesInputIterator2, - class ValuesOutputIterator, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -merge_values(unsigned int flat_id, - ValuesInputIterator1 values_input1, - ValuesInputIterator2 values_input2, - ValuesOutputIterator values_output, - unsigned int (&index)[ItemsPerThread], - const size_t input1_size, - const size_t input2_size) -{ - using value_type = typename std::iterator_traits::value_type; + unsigned int partition_id = id * spacing; + unsigned int diag = min(partition_id, (unsigned int)(input1_size + input2_size)); - unsigned int count = input1_size + input2_size; + unsigned int begin = merge_path( + keys_input1, keys_input2, input1_size, input2_size, diag, compare_function); - value_type values[ItemsPerThread]; + indices[id] = begin; + } - if(count >= ItemsPerThread * BlockSize) + template + ROCPRIM_DEVICE inline void load(unsigned int flat_id, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + KeyType* keys_shared, + const size_t input1_size, + const size_t input2_size) { - #pragma unroll +#pragma unroll for(unsigned int i = 0; i < ItemsPerThread; ++i) { - values[i] = (index[i] < input1_size) ? values_input1[index[i]] : - values_input2[index[i] - input1_size]; + unsigned int index = BlockSize * i + flat_id; + if(index < input1_size) + { + keys_shared[index] = keys_input1[index]; + } + else if(index < input1_size + input2_size) + { + keys_shared[index] = keys_input2[index - input1_size]; + } } + + ::rocprim::syncthreads(); } - else + + template + ROCPRIM_DEVICE inline void serial_merge(KeyType* keys_shared, + KeyType (&inputs)[ItemsPerThread], + unsigned int (&index)[ItemsPerThread], + range_t range, + BinaryFunction compare_function) { - #pragma unroll + KeyType a = keys_shared[range.begin1]; + KeyType b = keys_shared[range.begin2]; + +#pragma unroll for(unsigned int i = 0; i < ItemsPerThread; ++i) { - if(flat_id * ItemsPerThread + i < count) + bool compare = (range.begin2 >= range.end2) + || ((range.begin1 < range.end1) && !compare_function(b, a)); + unsigned int x = compare ? range.begin1 : range.begin2; + + inputs[i] = compare ? a : b; + index[i] = x; + + KeyType c = keys_shared[++x]; + if(compare) + { + a = c; + range.begin1 = x; + } + else { - values[i] = (index[i] < input1_size) ? values_input1[index[i]] : - values_input2[index[i] - input1_size]; + b = c; + range.begin2 = x; } } + ::rocprim::syncthreads(); } - ::rocprim::syncthreads(); - - block_store_direct_blocked( - flat_id, - values_output, - values, - count - ); -} - -template< - bool WithValues, - unsigned int BlockSize, - class ValuesInputIterator1, - class ValuesInputIterator2, - class ValuesOutputIterator, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -merge_values(unsigned int flat_id, - ValuesInputIterator1 values_input1, - ValuesInputIterator2 values_input2, - ValuesOutputIterator values_output, - unsigned int (&index)[ItemsPerThread], - const size_t input1_size, - const size_t input2_size) -{ - (void) flat_id; - (void) values_input1; - (void) values_input2; - (void) values_output; - (void) index; - (void) input1_size; - (void) input2_size; -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class IndexIterator, - class KeysInputIterator1, - class KeysInputIterator2, - class KeysOutputIterator, - class ValuesInputIterator1, - class ValuesInputIterator2, - class ValuesOutputIterator, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void merge_kernel_impl(IndexIterator indices, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - KeysOutputIterator keys_output, - ValuesInputIterator1 values_input1, - ValuesInputIterator2 values_input2, - ValuesOutputIterator values_output, - const size_t input1_size, - const size_t input2_size, - BinaryFunction compare_function) -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; - using keys_store_type = ::rocprim::block_store< - key_type, BlockSize, ItemsPerThread, - ::rocprim::block_store_method::block_store_transpose - >; - constexpr bool with_values = !std::is_same::value; - - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - constexpr unsigned int input_block_size = BlockSize * ItemsPerThread + 1; - - ROCPRIM_SHARED_MEMORY union + template + ROCPRIM_DEVICE inline void merge_keys(unsigned int flat_id, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + KeyType (&key_inputs)[ItemsPerThread], + unsigned int (&index)[ItemsPerThread], + KeyType* keys_shared, + range_t range, + BinaryFunction compare_function) { - typename detail::raw_storage keys_shared; - typename keys_store_type::storage_type keys_store; - } storage; - - key_type input[ItemsPerThread]; - unsigned int index[ItemsPerThread]; - - const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - const unsigned int block_offset = flat_block_id * items_per_block; - const unsigned int count = input1_size + input2_size; - const unsigned int number_of_blocks = (count + items_per_block - 1)/items_per_block; - const auto valid_in_last_block = count - items_per_block * (number_of_blocks - 1); - - const unsigned int p1 = indices[flat_block_id]; - const unsigned int p2 = indices[flat_block_id + 1]; - - range_t range = - compute_range( - flat_block_id, input1_size, input2_size, items_per_block, - p1, p2 - ); - - merge_keys( - flat_id, keys_input1, keys_input2, input, index, - storage.keys_shared.get(), - range, compare_function - ); - - ::rocprim::syncthreads(); - - if(flat_block_id == (number_of_blocks - 1)) // last block + load(flat_id, + keys_input1 + range.begin1, + keys_input2 + range.begin2, + keys_shared, + range.count1(), + range.count2()); + + range_t range_local + = range_t {0, range.count1(), range.count1(), (range.count1() + range.count2())}; + + unsigned int diag = ItemsPerThread * flat_id; + unsigned int partition = merge_path(keys_shared + range_local.begin1, + keys_shared + range_local.begin2, + range_local.count1(), + range_local.count2(), + diag, + compare_function); + + range_t range_partition = range_t {range_local.begin1 + partition, + range_local.end1, + range_local.begin2 + diag - partition, + range_local.end2}; + + serial_merge(keys_shared, key_inputs, index, range_partition, compare_function); + } + + template + ROCPRIM_DEVICE inline typename std::enable_if::type + merge_values(unsigned int flat_id, + ValuesInputIterator1 values_input1, + ValuesInputIterator2 values_input2, + ValuesOutputIterator values_output, + unsigned int (&index)[ItemsPerThread], + const size_t input1_size, + const size_t input2_size) { - keys_store_type().store( - keys_output + block_offset, - input, - valid_in_last_block, - storage.keys_store - ); + using value_type = typename std::iterator_traits::value_type; + + unsigned int count = input1_size + input2_size; + + value_type values[ItemsPerThread]; + + if(count >= ItemsPerThread * BlockSize) + { +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + values[i] = (index[i] < input1_size) ? values_input1[index[i]] + : values_input2[index[i] - input1_size]; + } + } + else + { +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; ++i) + { + if(flat_id * ItemsPerThread + i < count) + { + values[i] = (index[i] < input1_size) ? values_input1[index[i]] + : values_input2[index[i] - input1_size]; + } + } + } + + ::rocprim::syncthreads(); + + block_store_direct_blocked(flat_id, values_output, values, count); } - else + + template + ROCPRIM_DEVICE inline typename std::enable_if::type + merge_values(unsigned int flat_id, + ValuesInputIterator1 values_input1, + ValuesInputIterator2 values_input2, + ValuesOutputIterator values_output, + unsigned int (&index)[ItemsPerThread], + const size_t input1_size, + const size_t input2_size) { - keys_store_type().store( - keys_output + block_offset, - input, - storage.keys_store - ); + (void)flat_id; + (void)values_input1; + (void)values_input2; + (void)values_output; + (void)index; + (void)input1_size; + (void)input2_size; } - merge_values( - flat_id, values_input1 + range.begin1, values_input2 + range.begin2, - values_output + block_offset, index, - range.count1(), range.count2() - ); -} + template + ROCPRIM_DEVICE inline void merge_kernel_impl(IndexIterator indices, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + KeysOutputIterator keys_output, + ValuesInputIterator1 values_input1, + ValuesInputIterator2 values_input2, + ValuesOutputIterator values_output, + const size_t input1_size, + const size_t input2_size, + BinaryFunction compare_function) + { + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + using keys_store_type + = ::rocprim::block_store; + constexpr bool with_values = !std::is_same::value; + + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + constexpr unsigned int input_block_size = BlockSize * ItemsPerThread + 1; + + ROCPRIM_SHARED_MEMORY union + { + typename detail::raw_storage keys_shared; + typename keys_store_type::storage_type keys_store; + } storage; + + key_type input[ItemsPerThread]; + unsigned int index[ItemsPerThread]; + + const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + const unsigned int block_offset = flat_block_id * items_per_block; + const unsigned int count = input1_size + input2_size; + const unsigned int number_of_blocks = (count + items_per_block - 1) / items_per_block; + const auto valid_in_last_block = count - items_per_block * (number_of_blocks - 1); + + const unsigned int p1 = indices[flat_block_id]; + const unsigned int p2 = indices[flat_block_id + 1]; + + range_t range + = compute_range(flat_block_id, input1_size, input2_size, items_per_block, p1, p2); + + merge_keys(flat_id, + keys_input1, + keys_input2, + input, + index, + storage.keys_shared.get(), + range, + compare_function); + + ::rocprim::syncthreads(); + + if(flat_block_id == (number_of_blocks - 1)) // last block + { + keys_store_type().store( + keys_output + block_offset, input, valid_in_last_block, storage.keys_store); + } + else + { + keys_store_type().store(keys_output + block_offset, input, storage.keys_store); + } + + merge_values(flat_id, + values_input1 + range.begin1, + values_input2 + range.begin2, + values_output + block_offset, + index, + range.count1(), + range.count2()); + } } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp index a2a84fef2..92d1bcc90 100644 --- a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp @@ -21,14 +21,14 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_SORT_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_SORT_HPP_ -#include #include +#include #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_load.hpp" @@ -40,452 +40,345 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - bool WithValues, - unsigned int BlockSize, - class KeysInputIterator, - class ValuesInputIterator, - class Key, - class Value, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -block_load_impl(const unsigned int flat_id, - const unsigned int block_offset, - const unsigned int valid_in_last_block, - const bool last_block, - KeysInputIterator keys_input, - ValuesInputIterator values_input, - Key (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread]) -{ - (void) values_input; - (void) values; - - if(last_block) - { - block_load_direct_striped( - flat_id, - keys_input + block_offset, - keys, - valid_in_last_block - ); - } - else + template + ROCPRIM_DEVICE inline typename std::enable_if::type + block_load_impl(const unsigned int flat_id, + const unsigned int block_offset, + const unsigned int valid_in_last_block, + const bool last_block, + KeysInputIterator keys_input, + ValuesInputIterator values_input, + Key (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread]) { - block_load_direct_striped( - flat_id, - keys_input + block_offset, - keys - ); - } + (void)values_input; + (void)values; -} - -template< - bool WithValues, - unsigned int BlockSize, - class KeysInputIterator, - class ValuesInputIterator, - class Key, - class Value, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -block_load_impl(const unsigned int flat_id, - const unsigned int block_offset, - const unsigned int valid_in_last_block, - const bool last_block, - KeysInputIterator keys_input, - ValuesInputIterator values_input, - Key (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread]) -{ - if(last_block) - { - block_load_direct_striped( - flat_id, - keys_input + block_offset, - keys, - valid_in_last_block - ); - - block_load_direct_striped( - flat_id, - values_input + block_offset, - values, - valid_in_last_block - ); - } - else - { - block_load_direct_striped( - flat_id, - keys_input + block_offset, - keys - ); - - block_load_direct_striped( - flat_id, - values_input + block_offset, - values - ); + if(last_block) + { + block_load_direct_striped( + flat_id, keys_input + block_offset, keys, valid_in_last_block); + } + else + { + block_load_direct_striped(flat_id, keys_input + block_offset, keys); + } } -} - -template< - bool WithValues, - unsigned int BlockSize, - class KeysOutputIterator, - class ValuesOutputIterator, - class Key, - class Value, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -block_store_impl(const unsigned int flat_id, - const unsigned int block_offset, - const unsigned int valid_in_last_block, - const bool last_block, - KeysOutputIterator keys_output, - ValuesOutputIterator values_output, - Key (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread]) -{ - (void) values_output; - (void) values; - if(last_block) + template + ROCPRIM_DEVICE inline typename std::enable_if::type + block_load_impl(const unsigned int flat_id, + const unsigned int block_offset, + const unsigned int valid_in_last_block, + const bool last_block, + KeysInputIterator keys_input, + ValuesInputIterator values_input, + Key (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread]) { - block_store_direct_striped( - flat_id, - keys_output + block_offset, - keys, - valid_in_last_block - ); + if(last_block) + { + block_load_direct_striped( + flat_id, keys_input + block_offset, keys, valid_in_last_block); + + block_load_direct_striped( + flat_id, values_input + block_offset, values, valid_in_last_block); + } + else + { + block_load_direct_striped(flat_id, keys_input + block_offset, keys); + + block_load_direct_striped(flat_id, values_input + block_offset, values); + } } - else + + template + ROCPRIM_DEVICE inline typename std::enable_if::type + block_store_impl(const unsigned int flat_id, + const unsigned int block_offset, + const unsigned int valid_in_last_block, + const bool last_block, + KeysOutputIterator keys_output, + ValuesOutputIterator values_output, + Key (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread]) { - block_store_direct_striped( - flat_id, - keys_output + block_offset, - keys - ); + (void)values_output; + (void)values; + + if(last_block) + { + block_store_direct_striped( + flat_id, keys_output + block_offset, keys, valid_in_last_block); + } + else + { + block_store_direct_striped(flat_id, keys_output + block_offset, keys); + } } -} - -template< - bool WithValues, - unsigned int BlockSize, - class KeysOutputIterator, - class ValuesOutputIterator, - class Key, - class Value, - unsigned int ItemsPerThread -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -block_store_impl(const unsigned int flat_id, - const unsigned int block_offset, - const unsigned int valid_in_last_block, - const bool last_block, - KeysOutputIterator keys_output, - ValuesOutputIterator values_output, - Key (&keys)[ItemsPerThread], - Value (&values)[ItemsPerThread]) -{ - if(last_block) + + template + ROCPRIM_DEVICE inline typename std::enable_if::type + block_store_impl(const unsigned int flat_id, + const unsigned int block_offset, + const unsigned int valid_in_last_block, + const bool last_block, + KeysOutputIterator keys_output, + ValuesOutputIterator values_output, + Key (&keys)[ItemsPerThread], + Value (&values)[ItemsPerThread]) { - block_store_direct_striped( - flat_id, - keys_output + block_offset, - keys, - valid_in_last_block - ); - - block_store_direct_striped( - flat_id, - values_output + block_offset, - values, - valid_in_last_block - ); + if(last_block) + { + block_store_direct_striped( + flat_id, keys_output + block_offset, keys, valid_in_last_block); + + block_store_direct_striped( + flat_id, values_output + block_offset, values, valid_in_last_block); + } + else + { + block_store_direct_striped(flat_id, keys_output + block_offset, keys); + + block_store_direct_striped(flat_id, values_output + block_offset, values); + } } - else + + template + ROCPRIM_DEVICE inline typename std::enable_if::type + block_sort_impl(Key& key, + Value& value, + const unsigned int valid_in_last_block, + const bool last_block, + BinaryFunction compare_function) { - block_store_direct_striped( - flat_id, - keys_output + block_offset, - keys - ); - - block_store_direct_striped( - flat_id, - values_output + block_offset, - values - ); - } -} - -template< - bool WithValues, - unsigned int BlockSize, - class Key, - class Value, - class BinaryFunction -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -block_sort_impl(Key& key, - Value& value, - const unsigned int valid_in_last_block, - const bool last_block, - BinaryFunction compare_function) -{ - using block_sort_type = ::rocprim::block_sort< - Key, BlockSize - >; + using block_sort_type = ::rocprim::block_sort; - ROCPRIM_SHARED_MEMORY typename block_sort_type::storage_type storage; + ROCPRIM_SHARED_MEMORY typename block_sort_type::storage_type storage; - (void) value; + (void)value; - if(last_block) - { - block_sort_type() - .sort( - key, // keys_input - storage, - valid_in_last_block, - compare_function - ); + if(last_block) + { + block_sort_type().sort(key, // keys_input + storage, + valid_in_last_block, + compare_function); + } + else + { + block_sort_type().sort(key, // keys_input + storage, + compare_function); + } } - else + + template + ROCPRIM_DEVICE inline typename std::enable_if::type + block_sort_impl(Key& key, + Value& value, + const unsigned int valid_in_last_block, + const bool last_block, + BinaryFunction compare_function) { - block_sort_type() - .sort( - key, // keys_input - storage, - compare_function - ); - } -} - -template< - bool WithValues, - unsigned int BlockSize, - class Key, - class Value, - class BinaryFunction -> -ROCPRIM_DEVICE inline -typename std::enable_if::type -block_sort_impl(Key& key, - Value& value, - const unsigned int valid_in_last_block, - const bool last_block, - BinaryFunction compare_function) -{ - using block_sort_type = ::rocprim::block_sort< - Key, BlockSize, Value - >; + using block_sort_type = ::rocprim::block_sort; - ROCPRIM_SHARED_MEMORY typename block_sort_type::storage_type storage; + ROCPRIM_SHARED_MEMORY typename block_sort_type::storage_type storage; - if(last_block) - { - block_sort_type() - .sort( - key, // keys_input - value, // values_input - storage, - valid_in_last_block, - compare_function - ); + if(last_block) + { + block_sort_type().sort(key, // keys_input + value, // values_input + storage, + valid_in_last_block, + compare_function); + } + else + { + block_sort_type().sort(key, // keys_input + value, // values_input + storage, + compare_function); + } } - else + + template + ROCPRIM_DEVICE inline void block_sort_kernel_impl(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const size_t input_size, + BinaryFunction compare_function) { - block_sort_type() - .sort( - key, // keys_input - value, // values_input - storage, - compare_function - ); + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + using stable_key_type = rocprim::tuple; + constexpr bool with_values = !std::is_same::value; + + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + const unsigned int block_offset = flat_block_id * BlockSize; + const unsigned int number_of_blocks = (input_size + BlockSize - 1) / BlockSize; + auto valid_in_last_block = input_size - BlockSize * (number_of_blocks - 1); + const bool last_block = flat_block_id == (number_of_blocks - 1); + + key_type key[1]; + value_type value[1]; + + block_load_impl(flat_id, + block_offset, + valid_in_last_block, + last_block, + keys_input, + values_input, + key, + value); + + // Special comparison that preserves relative order of equal keys + auto stable_compare_function = + [compare_function](const stable_key_type& a, const stable_key_type& b) mutable -> bool { + const bool ab = compare_function(rocprim::get<0>(a), rocprim::get<0>(b)); + const bool ba = compare_function(rocprim::get<0>(b), rocprim::get<0>(a)); + return ab || (!ba && (rocprim::get<1>(a) < rocprim::get<1>(b))); + }; + + stable_key_type stable_key = rocprim::make_tuple(key[0], flat_id); + block_sort_impl( + stable_key, value[0], valid_in_last_block, last_block, stable_compare_function); + key[0] = rocprim::get<0>(stable_key); + + block_store_impl(flat_id, + block_offset, + valid_in_last_block, + last_block, + keys_output, + values_output, + key, + value); } -} - -template< - unsigned int BlockSize, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void block_sort_kernel_impl(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const size_t input_size, - BinaryFunction compare_function) -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; - using stable_key_type = rocprim::tuple; - constexpr bool with_values = !std::is_same::value; - - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - const unsigned int block_offset = flat_block_id * BlockSize; - const unsigned int number_of_blocks = (input_size + BlockSize - 1)/BlockSize; - auto valid_in_last_block = input_size - BlockSize * (number_of_blocks - 1); - const bool last_block = flat_block_id == (number_of_blocks - 1); - - key_type key[1]; - value_type value[1]; - - block_load_impl( - flat_id, - block_offset, - valid_in_last_block, - last_block, - keys_input, - values_input, - key, - value - ); - - // Special comparison that preserves relative order of equal keys - auto stable_compare_function = [compare_function](const stable_key_type& a, const stable_key_type& b) mutable -> bool - { - const bool ab = compare_function(rocprim::get<0>(a), rocprim::get<0>(b)); - const bool ba = compare_function(rocprim::get<0>(b), rocprim::get<0>(a)); - return ab || (!ba && (rocprim::get<1>(a) < rocprim::get<1>(b))); - }; - - stable_key_type stable_key = rocprim::make_tuple(key[0], flat_id); - block_sort_impl( - stable_key, - value[0], - valid_in_last_block, - last_block, - stable_compare_function - ); - key[0] = rocprim::get<0>(stable_key); - - block_store_impl( - flat_id, - block_offset, - valid_in_last_block, - last_block, - keys_output, - values_output, - key, - value - ); -} - -template< - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void block_merge_kernel_impl(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const size_t input_size, - const unsigned int block_size, - BinaryFunction compare_function) -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; - constexpr bool with_values = !std::is_same::value; - const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - const unsigned int flat_block_size = ::rocprim::detail::block_size<0>(); - unsigned int id = (flat_block_id * flat_block_size) + flat_id; - - if (id >= input_size) + template + ROCPRIM_DEVICE inline void block_merge_kernel_impl(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const size_t input_size, + const unsigned int block_size, + BinaryFunction compare_function) { - return; - } + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + constexpr bool with_values = !std::is_same::value; - key_type key; - value_type value; + const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + const unsigned int flat_block_size = ::rocprim::detail::block_size<0>(); + unsigned int id = (flat_block_id * flat_block_size) + flat_id; - key = keys_input[id]; - if(with_values) - { - value = values_input[id]; - } + if(id >= input_size) + { + return; + } - const unsigned int block_id = id / block_size; - const bool block_id_is_odd = block_id & 1; - const unsigned int next_block_id = block_id_is_odd ? block_id - 1 : - block_id + 1; - const unsigned int block_start = min(block_id * block_size, (unsigned int) input_size); - const unsigned int next_block_start = min(next_block_id * block_size, (unsigned int) input_size); - const unsigned int next_block_end = min((next_block_id + 1) * block_size, (unsigned int) input_size); + key_type key; + value_type value; - if(next_block_start == input_size) - { - keys_output[id] = key; + key = keys_input[id]; if(with_values) { - values_output[id] = value; + value = values_input[id]; } - return; - } - unsigned int left_id = next_block_start; - unsigned int right_id = next_block_end; + const unsigned int block_id = id / block_size; + const bool block_id_is_odd = block_id & 1; + const unsigned int next_block_id = block_id_is_odd ? block_id - 1 : block_id + 1; + const unsigned int block_start = min(block_id * block_size, (unsigned int)input_size); + const unsigned int next_block_start + = min(next_block_id * block_size, (unsigned int)input_size); + const unsigned int next_block_end + = min((next_block_id + 1) * block_size, (unsigned int)input_size); - while(left_id < right_id) - { - unsigned int mid_id = (left_id + right_id) / 2; - key_type mid_key = keys_input[mid_id]; - bool smaller = compare_function(mid_key, key); - left_id = smaller ? mid_id + 1 : left_id; - right_id = smaller ? right_id : mid_id; - } + if(next_block_start == input_size) + { + keys_output[id] = key; + if(with_values) + { + values_output[id] = value; + } + return; + } - right_id = next_block_end; - if(block_id_is_odd && left_id != right_id) - { - key_type upper_key = keys_input[left_id]; - while(!compare_function(upper_key, key) && - !compare_function(key, upper_key) && - left_id < right_id) + unsigned int left_id = next_block_start; + unsigned int right_id = next_block_end; + + while(left_id < right_id) { - unsigned int mid_id = (left_id + right_id) / 2; - key_type mid_key = keys_input[mid_id]; - bool equal = !compare_function(mid_key, key) && - !compare_function(key, mid_key); - left_id = equal ? mid_id + 1 : left_id + 1; - right_id = equal ? right_id : mid_id; - upper_key = keys_input[left_id]; + unsigned int mid_id = (left_id + right_id) / 2; + key_type mid_key = keys_input[mid_id]; + bool smaller = compare_function(mid_key, key); + left_id = smaller ? mid_id + 1 : left_id; + right_id = smaller ? right_id : mid_id; } - } - unsigned int offset = 0; - offset += id - block_start; - offset += left_id - next_block_start; - offset += min(block_start, next_block_start); - keys_output[offset] = key; - if(with_values) - { - values_output[offset] = value; + right_id = next_block_end; + if(block_id_is_odd && left_id != right_id) + { + key_type upper_key = keys_input[left_id]; + while(!compare_function(upper_key, key) && !compare_function(key, upper_key) + && left_id < right_id) + { + unsigned int mid_id = (left_id + right_id) / 2; + key_type mid_key = keys_input[mid_id]; + bool equal = !compare_function(mid_key, key) && !compare_function(key, mid_key); + left_id = equal ? mid_id + 1 : left_id + 1; + right_id = equal ? right_id : mid_id; + upper_key = keys_input[left_id]; + } + } + + unsigned int offset = 0; + offset += id - block_start; + offset += left_id - next_block_start; + offset += min(block_start, next_block_start); + keys_output[offset] = key; + if(with_values) + { + values_output[offset] = value; + } } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_partition.hpp b/rocprim/include/rocprim/device/detail/device_partition.hpp index e38998b76..04df70500 100644 --- a/rocprim/include/rocprim/device/detail/device_partition.hpp +++ b/rocprim/include/rocprim/device/detail/device_partition.hpp @@ -21,17 +21,17 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_PARTITION_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_PARTITION_HPP_ -#include #include +#include #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_load.hpp" -#include "../../block/block_store.hpp" #include "../../block/block_scan.hpp" +#include "../../block/block_store.hpp" #include "device_scan_lookback.hpp" #include "lookback_scan_state.hpp" @@ -42,556 +42,493 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -class offset_lookback_scan_prefix_op : public lookback_scan_prefix_op, LookbackScanState> -{ - using base_type = lookback_scan_prefix_op, LookbackScanState>; - using binary_op_type = ::rocprim::plus; -public: - - struct storage_type + template + class offset_lookback_scan_prefix_op + : public lookback_scan_prefix_op, LookbackScanState> { - T block_reduction; - T exclusive_prefix; - }; + using base_type = lookback_scan_prefix_op, LookbackScanState>; + using binary_op_type = ::rocprim::plus; - ROCPRIM_DEVICE inline - offset_lookback_scan_prefix_op(unsigned int block_id, - LookbackScanState &state, - storage_type& storage) - : base_type(block_id, binary_op_type(), state), storage_(storage) - { - } + public: + struct storage_type + { + T block_reduction; + T exclusive_prefix; + }; - ROCPRIM_DEVICE inline - ~offset_lookback_scan_prefix_op() = default; + ROCPRIM_DEVICE inline offset_lookback_scan_prefix_op(unsigned int block_id, + LookbackScanState& state, + storage_type& storage) + : base_type(block_id, binary_op_type(), state) + , storage_(storage) + { + } - ROCPRIM_DEVICE inline - T operator()(T reduction) - { - auto prefix = base_type::operator()(reduction); - if(::rocprim::lane_id() == 0) + ROCPRIM_DEVICE inline ~offset_lookback_scan_prefix_op() = default; + + ROCPRIM_DEVICE inline T operator()(T reduction) { - storage_.block_reduction = reduction; - storage_.exclusive_prefix = prefix; + auto prefix = base_type::operator()(reduction); + if(::rocprim::lane_id() == 0) + { + storage_.block_reduction = reduction; + storage_.exclusive_prefix = prefix; + } + return prefix; } - return prefix; - } - ROCPRIM_DEVICE inline - T get_reduction() const - { - return storage_.block_reduction; - } + ROCPRIM_DEVICE inline T get_reduction() const + { + return storage_.block_reduction; + } - ROCPRIM_DEVICE inline - T get_exclusive_prefix() const - { - return storage_.exclusive_prefix; - } + ROCPRIM_DEVICE inline T get_exclusive_prefix() const + { + return storage_.exclusive_prefix; + } -private: - storage_type& storage_; -}; + private: + storage_type& storage_; + }; -enum class select_method -{ - flag = 0, - predicate = 1, - unique = 2 -}; - -template< - select_method SelectMethod, - unsigned int BlockSize, - class BlockLoadFlagsType, - class BlockDiscontinuityType, - class InputIterator, - class FlagIterator, - class ValueType, - unsigned int ItemsPerThread, - class UnaryPredicate, - class InequalityOp, - class StorageType -> -ROCPRIM_DEVICE inline -auto partition_block_load_flags(InputIterator /* block_predecessor */, - FlagIterator block_flags, - ValueType (&/* values */)[ItemsPerThread], - bool (&is_selected)[ItemsPerThread], - UnaryPredicate /* predicate */, - InequalityOp /* inequality_op */, - StorageType& storage, - const unsigned int /* block_id */, - const unsigned int /* block_thread_id */, - const bool is_last_block, - const unsigned int valid_in_last_block) - -> typename std::enable_if::type -{ - if(is_last_block) // last block + enum class select_method { - BlockLoadFlagsType() - .load( - block_flags, - is_selected, - valid_in_last_block, - false, - storage.load_flags - ); - } - else + flag = 0, + predicate = 1, + unique = 2 + }; + + template + ROCPRIM_DEVICE inline auto partition_block_load_flags(InputIterator /* block_predecessor */, + FlagIterator block_flags, + ValueType (&/* values */)[ItemsPerThread], + bool (&is_selected)[ItemsPerThread], + UnaryPredicate /* predicate */, + InequalityOp /* inequality_op */, + StorageType& storage, + const unsigned int /* block_id */, + const unsigned int /* block_thread_id */, + const bool is_last_block, + const unsigned int valid_in_last_block) -> + typename std::enable_if::type { - BlockLoadFlagsType() - .load( - block_flags, - is_selected, - storage.load_flags - ); + if(is_last_block) // last block + { + BlockLoadFlagsType().load( + block_flags, is_selected, valid_in_last_block, false, storage.load_flags); + } + else + { + BlockLoadFlagsType().load(block_flags, is_selected, storage.load_flags); + } + ::rocprim::syncthreads(); // sync threads to reuse shared memory } - ::rocprim::syncthreads(); // sync threads to reuse shared memory -} - -template< - select_method SelectMethod, - unsigned int BlockSize, - class BlockLoadFlagsType, - class BlockDiscontinuityType, - class InputIterator, - class FlagIterator, - class ValueType, - unsigned int ItemsPerThread, - class UnaryPredicate, - class InequalityOp, - class StorageType -> -ROCPRIM_DEVICE inline -auto partition_block_load_flags(InputIterator /* block_predecessor */, - FlagIterator /* block_flags */, - ValueType (&values)[ItemsPerThread], - bool (&is_selected)[ItemsPerThread], - UnaryPredicate predicate, - InequalityOp /* inequality_op */, - StorageType& /* storage */, - const unsigned int /* block_id */, - const unsigned int block_thread_id, - const bool is_last_block, - const unsigned int valid_in_last_block) - -> typename std::enable_if::type -{ - if(is_last_block) // last block + + template + ROCPRIM_DEVICE inline auto partition_block_load_flags(InputIterator /* block_predecessor */, + FlagIterator /* block_flags */, + ValueType (&values)[ItemsPerThread], + bool (&is_selected)[ItemsPerThread], + UnaryPredicate predicate, + InequalityOp /* inequality_op */, + StorageType& /* storage */, + const unsigned int /* block_id */, + const unsigned int block_thread_id, + const bool is_last_block, + const unsigned int valid_in_last_block) -> + typename std::enable_if::type { - const auto offset = block_thread_id * ItemsPerThread; - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) + if(is_last_block) // last block { - if((offset + i) < valid_in_last_block) + const auto offset = block_thread_id * ItemsPerThread; +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) { - is_selected[i] = predicate(values[i]); + if((offset + i) < valid_in_last_block) + { + is_selected[i] = predicate(values[i]); + } + else + { + is_selected[i] = false; + } } - else + } + else + { +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) { - is_selected[i] = false; + is_selected[i] = predicate(values[i]); } } } - else + + template + ROCPRIM_DEVICE inline auto partition_block_load_flags(InputIterator block_predecessor, + FlagIterator /* block_flags */, + ValueType (&values)[ItemsPerThread], + bool (&is_selected)[ItemsPerThread], + UnaryPredicate /* predicate */, + InequalityOp inequality_op, + StorageType& storage, + const unsigned int block_id, + const unsigned int block_thread_id, + const bool is_last_block, + const unsigned int valid_in_last_block) -> + typename std::enable_if::type { - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) + if(block_id > 0) { - is_selected[i] = predicate(values[i]); + const ValueType predecessor = *block_predecessor; + BlockDiscontinuityType().flag_heads( + is_selected, predecessor, values, inequality_op, storage.discontinuity_values); + } + else + { + BlockDiscontinuityType().flag_heads( + is_selected, values, inequality_op, storage.discontinuity_values); } - } -} - -template< - select_method SelectMethod, - unsigned int BlockSize, - class BlockLoadFlagsType, - class BlockDiscontinuityType, - class InputIterator, - class FlagIterator, - class ValueType, - unsigned int ItemsPerThread, - class UnaryPredicate, - class InequalityOp, - class StorageType -> -ROCPRIM_DEVICE inline -auto partition_block_load_flags(InputIterator block_predecessor, - FlagIterator /* block_flags */, - ValueType (&values)[ItemsPerThread], - bool (&is_selected)[ItemsPerThread], - UnaryPredicate /* predicate */, - InequalityOp inequality_op, - StorageType& storage, - const unsigned int block_id, - const unsigned int block_thread_id, - const bool is_last_block, - const unsigned int valid_in_last_block) - -> typename std::enable_if::type -{ - if(block_id > 0) - { - const ValueType predecessor = *block_predecessor; - BlockDiscontinuityType() - .flag_heads( - is_selected, - predecessor, - values, - inequality_op, - storage.discontinuity_values - ); - } - else - { - BlockDiscontinuityType() - .flag_heads( - is_selected, - values, - inequality_op, - storage.discontinuity_values - ); - } - // Set is_selected for invalid items to false - if(is_last_block) - { - const auto offset = block_thread_id * ItemsPerThread; - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) + // Set is_selected for invalid items to false + if(is_last_block) { - if((offset + i) >= valid_in_last_block) + const auto offset = block_thread_id * ItemsPerThread; +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) { - is_selected[i] = false; + if((offset + i) >= valid_in_last_block) + { + is_selected[i] = false; + } } } + ::rocprim::syncthreads(); // sync threads to reuse shared memory } - ::rocprim::syncthreads(); // sync threads to reuse shared memory -} - -template< - bool OnlySelected, - unsigned int BlockSize, - class ValueType, - unsigned int ItemsPerThread, - class OffsetType, - class OutputIterator, - class ScatterStorageType -> -ROCPRIM_DEVICE inline -auto partition_scatter(ValueType (&values)[ItemsPerThread], - bool (&is_selected)[ItemsPerThread], - OffsetType (&output_indices)[ItemsPerThread], - OutputIterator output, - const size_t size, - const OffsetType selected_prefix, - const OffsetType selected_in_block, - ScatterStorageType& storage, - const unsigned int flat_block_id, - const unsigned int flat_block_thread_id, - const bool is_last_block, - const unsigned int valid_in_last_block) - -> typename std::enable_if::type -{ - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - - // Scatter selected/rejected values to shared memory - auto scatter_storage = storage.get(); - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - unsigned int item_index = (flat_block_thread_id * ItemsPerThread) + i; - unsigned int selected_item_index = output_indices[i] - selected_prefix; - unsigned int rejected_item_index = (item_index - selected_item_index) + selected_in_block; - // index of item in scatter_storage - unsigned int scatter_index = is_selected[i] ? selected_item_index : rejected_item_index; - scatter_storage[scatter_index] = values[i]; - } - ::rocprim::syncthreads(); // sync threads to reuse shared memory - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) + template + ROCPRIM_DEVICE inline auto partition_scatter(ValueType (&values)[ItemsPerThread], + bool (&is_selected)[ItemsPerThread], + OffsetType (&output_indices)[ItemsPerThread], + OutputIterator output, + const size_t size, + const OffsetType selected_prefix, + const OffsetType selected_in_block, + ScatterStorageType& storage, + const unsigned int flat_block_id, + const unsigned int flat_block_thread_id, + const bool is_last_block, + const unsigned int valid_in_last_block) -> + typename std::enable_if::type { - unsigned int item_index = (i * BlockSize) + flat_block_thread_id; - unsigned int selected_item_index = item_index; - unsigned int rejected_item_index = item_index - selected_in_block; - // number of values rejected in previous blocks - unsigned int rejected_prefix = (flat_block_id * items_per_block) - selected_prefix; - // destination index of item scatter_storage[item_index] in output - OffsetType scatter_index = item_index < selected_in_block - ? selected_prefix + selected_item_index - : size - (rejected_prefix + rejected_item_index + 1); - - // last block can store only valid_in_last_block items - if(!is_last_block || item_index < valid_in_last_block) - { - output[scatter_index] = scatter_storage[item_index]; - } - } -} - -template< - bool OnlySelected, - unsigned int BlockSize, - class ValueType, - unsigned int ItemsPerThread, - class OffsetType, - class OutputIterator, - class ScatterStorageType -> -ROCPRIM_DEVICE inline -auto partition_scatter(ValueType (&values)[ItemsPerThread], - bool (&is_selected)[ItemsPerThread], - OffsetType (&output_indices)[ItemsPerThread], - OutputIterator output, - const size_t size, - const OffsetType selected_prefix, - const OffsetType selected_in_block, - ScatterStorageType& storage, - const unsigned int flat_block_id, - const unsigned int flat_block_thread_id, - const bool is_last_block, - const unsigned int valid_in_last_block) - -> typename std::enable_if::type -{ - (void) size; - (void) storage; - (void) flat_block_id; - (void) flat_block_thread_id; - (void) valid_in_last_block; + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - if(selected_in_block > BlockSize) - { - // Scatter selected values to shared memory + // Scatter selected/rejected values to shared memory auto scatter_storage = storage.get(); - #pragma unroll +#pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { - unsigned int scatter_index = output_indices[i] - selected_prefix; - if(is_selected[i]) - { - scatter_storage[scatter_index] = values[i]; - } + unsigned int item_index = (flat_block_thread_id * ItemsPerThread) + i; + unsigned int selected_item_index = output_indices[i] - selected_prefix; + unsigned int rejected_item_index + = (item_index - selected_item_index) + selected_in_block; + // index of item in scatter_storage + unsigned int scatter_index = is_selected[i] ? selected_item_index : rejected_item_index; + scatter_storage[scatter_index] = values[i]; } ::rocprim::syncthreads(); // sync threads to reuse shared memory - // Coalesced write from shared memory to global memory - for(unsigned int i = flat_block_thread_id; i < selected_in_block; i += BlockSize) +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) { - output[selected_prefix + i] = scatter_storage[i]; + unsigned int item_index = (i * BlockSize) + flat_block_thread_id; + unsigned int selected_item_index = item_index; + unsigned int rejected_item_index = item_index - selected_in_block; + // number of values rejected in previous blocks + unsigned int rejected_prefix = (flat_block_id * items_per_block) - selected_prefix; + // destination index of item scatter_storage[item_index] in output + OffsetType scatter_index = item_index < selected_in_block + ? selected_prefix + selected_item_index + : size - (rejected_prefix + rejected_item_index + 1); + + // last block can store only valid_in_last_block items + if(!is_last_block || item_index < valid_in_last_block) + { + output[scatter_index] = scatter_storage[item_index]; + } } } - else + + template + ROCPRIM_DEVICE inline auto partition_scatter(ValueType (&values)[ItemsPerThread], + bool (&is_selected)[ItemsPerThread], + OffsetType (&output_indices)[ItemsPerThread], + OutputIterator output, + const size_t size, + const OffsetType selected_prefix, + const OffsetType selected_in_block, + ScatterStorageType& storage, + const unsigned int flat_block_id, + const unsigned int flat_block_thread_id, + const bool is_last_block, + const unsigned int valid_in_last_block) -> + typename std::enable_if::type { - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) + (void)size; + (void)storage; + (void)flat_block_id; + (void)flat_block_thread_id; + (void)valid_in_last_block; + + if(selected_in_block > BlockSize) { - if(!is_last_block || output_indices[i] < (selected_prefix + selected_in_block)) + // Scatter selected values to shared memory + auto scatter_storage = storage.get(); +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) { + unsigned int scatter_index = output_indices[i] - selected_prefix; if(is_selected[i]) { - output[output_indices[i]] = values[i]; + scatter_storage[scatter_index] = values[i]; } } + ::rocprim::syncthreads(); // sync threads to reuse shared memory + + // Coalesced write from shared memory to global memory + for(unsigned int i = flat_block_thread_id; i < selected_in_block; i += BlockSize) + { + output[selected_prefix + i] = scatter_storage[i]; + } } - } -} - -template< - select_method SelectMethod, - bool OnlySelected, - class Config, - class InputIterator, - class FlagIterator, - class OutputIterator, - class SelectedCountOutputIterator, - class UnaryPredicate, - class InequalityOp, - class OffsetLookbackScanState -> -ROCPRIM_DEVICE inline -void partition_kernel_impl(InputIterator input, - FlagIterator flags, - OutputIterator output, - SelectedCountOutputIterator selected_count_output, - const size_t size, - UnaryPredicate predicate, - InequalityOp inequality_op, - OffsetLookbackScanState offset_scan_state, - const unsigned int number_of_blocks, - ordered_block_id ordered_bid) -{ - constexpr auto block_size = Config::block_size; - constexpr auto items_per_thread = Config::items_per_thread; - constexpr unsigned int items_per_block = block_size * items_per_thread; - - using offset_type = typename OffsetLookbackScanState::value_type; - using value_type = typename std::iterator_traits::value_type; - - // Block primitives - using block_load_value_type = ::rocprim::block_load< - value_type, block_size, items_per_thread, - Config::value_block_load_method - >; - using block_load_flag_type = ::rocprim::block_load< - bool, block_size, items_per_thread, - Config::value_block_load_method - >; - using block_scan_offset_type = ::rocprim::block_scan< - offset_type, block_size, - Config::block_scan_method - >; - using block_discontinuity_value_type = ::rocprim::block_discontinuity< - value_type, block_size - >; - using order_bid_type = ordered_block_id; - - // Offset prefix operation type - using offset_scan_prefix_op_type = offset_lookback_scan_prefix_op< - offset_type, OffsetLookbackScanState - >; - - // Memory required for 2-phase scatter - using exchange_storage_type = value_type[items_per_block]; - using raw_exchange_storage_type = typename detail::raw_storage; - - ROCPRIM_SHARED_MEMORY struct - { - typename order_bid_type::storage_type ordered_bid; - union + else { - raw_exchange_storage_type exchange_values; - typename block_load_value_type::storage_type load_values; - typename block_load_flag_type::storage_type load_flags; - typename block_discontinuity_value_type::storage_type discontinuity_values; - typename block_scan_offset_type::storage_type scan_offsets; - }; - } storage; - - const auto flat_block_thread_id = ::rocprim::flat_block_thread_id(); - const auto flat_block_id = ordered_bid.get(flat_block_thread_id, storage.ordered_bid); - const unsigned int block_offset = flat_block_id * items_per_block; - const auto valid_in_last_block = size - items_per_block * (number_of_blocks - 1); - - value_type values[items_per_thread]; - bool is_selected[items_per_thread]; - offset_type output_indices[items_per_thread]; - - // Load input values into values - bool is_last_block = flat_block_id == (number_of_blocks - 1); - if(is_last_block) // last block - { - block_load_value_type() - .load( - input + block_offset, - values, - valid_in_last_block, - storage.load_values - ); - } - else - { - block_load_value_type() - .load( - input + block_offset, - values, - storage.load_values - ); - } - ::rocprim::syncthreads(); // sync threads to reuse shared memory - - // Load selection flags into is_selected, generate them using - // input value and selection predicate, or generate them using - // block_discontinuity primitive - partition_block_load_flags< - SelectMethod, block_size, - block_load_flag_type, block_discontinuity_value_type - >( - input + block_offset - 1, - flags + block_offset, - values, - is_selected, - predicate, - inequality_op, - storage, - flat_block_id, - flat_block_thread_id, - is_last_block, - valid_in_last_block - ); - - // Convert true/false is_selected flags to 0s and 1s - #pragma unroll - for(unsigned int i = 0; i < items_per_thread; i++) - { - output_indices[i] = is_selected[i] ? 1 : 0; +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + if(!is_last_block || output_indices[i] < (selected_prefix + selected_in_block)) + { + if(is_selected[i]) + { + output[output_indices[i]] = values[i]; + } + } + } + } } - // Number of selected values in previous blocks - offset_type selected_prefix = 0; - // Number of selected values in this block - offset_type selected_in_block = 0; - - // Calculate number of selected values in block and their indices - if(flat_block_id == 0) + template + ROCPRIM_DEVICE inline void + partition_kernel_impl(InputIterator input, + FlagIterator flags, + OutputIterator output, + SelectedCountOutputIterator selected_count_output, + const size_t size, + UnaryPredicate predicate, + InequalityOp inequality_op, + OffsetLookbackScanState offset_scan_state, + const unsigned int number_of_blocks, + ordered_block_id ordered_bid) { - block_scan_offset_type() - .exclusive_scan( - output_indices, - output_indices, - offset_type(0), /** initial value */ - selected_in_block, - storage.scan_offsets, - ::rocprim::plus() - ); - if(flat_block_thread_id == 0) + constexpr auto block_size = Config::block_size; + constexpr auto items_per_thread = Config::items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; + + using offset_type = typename OffsetLookbackScanState::value_type; + using value_type = typename std::iterator_traits::value_type; + + // Block primitives + using block_load_value_type = ::rocprim:: + block_load; + using block_load_flag_type = ::rocprim:: + block_load; + using block_scan_offset_type + = ::rocprim::block_scan; + using block_discontinuity_value_type + = ::rocprim::block_discontinuity; + using order_bid_type = ordered_block_id; + + // Offset prefix operation type + using offset_scan_prefix_op_type + = offset_lookback_scan_prefix_op; + + // Memory required for 2-phase scatter + using exchange_storage_type = value_type[items_per_block]; + using raw_exchange_storage_type = typename detail::raw_storage; + + ROCPRIM_SHARED_MEMORY struct + { + typename order_bid_type::storage_type ordered_bid; + union + { + raw_exchange_storage_type exchange_values; + typename block_load_value_type::storage_type load_values; + typename block_load_flag_type::storage_type load_flags; + typename block_discontinuity_value_type::storage_type discontinuity_values; + typename block_scan_offset_type::storage_type scan_offsets; + }; + } storage; + + const auto flat_block_thread_id = ::rocprim::flat_block_thread_id(); + const auto flat_block_id = ordered_bid.get(flat_block_thread_id, storage.ordered_bid); + const unsigned int block_offset = flat_block_id * items_per_block; + const auto valid_in_last_block = size - items_per_block * (number_of_blocks - 1); + + value_type values[items_per_thread]; + bool is_selected[items_per_thread]; + offset_type output_indices[items_per_thread]; + + // Load input values into values + bool is_last_block = flat_block_id == (number_of_blocks - 1); + if(is_last_block) // last block { - offset_scan_state.set_complete(flat_block_id, selected_in_block); + block_load_value_type().load( + input + block_offset, values, valid_in_last_block, storage.load_values); + } + else + { + block_load_value_type().load(input + block_offset, values, storage.load_values); } - ::rocprim::syncthreads(); // sync threads to reuse shared memory - } - // Workaround: Fiji (gfx803) crashes with "Memory access fault by GPU node" on HCC 1.3.18482 (ROCm 2.0) - // Instead of just `} else {` we use `} syncthreads(); if() {`, because the else-branch can be executed - // for some unknown reason and 0-th block reads incorrect addresses in lookback_scan_prefix_op::get_prefix. - ::rocprim::syncthreads(); - if(flat_block_id > 0) - // end of the workaround - { - ROCPRIM_SHARED_MEMORY typename offset_scan_prefix_op_type::storage_type storage_prefix_op; - auto prefix_op = offset_scan_prefix_op_type( - flat_block_id, - offset_scan_state, - storage_prefix_op - ); - block_scan_offset_type() - .exclusive_scan( - output_indices, - output_indices, - storage.scan_offsets, - prefix_op, - ::rocprim::plus() - ); ::rocprim::syncthreads(); // sync threads to reuse shared memory - selected_in_block = prefix_op.get_reduction(); - selected_prefix = prefix_op.get_exclusive_prefix(); - } + // Load selection flags into is_selected, generate them using + // input value and selection predicate, or generate them using + // block_discontinuity primitive + partition_block_load_flags(input + block_offset - 1, + flags + block_offset, + values, + is_selected, + predicate, + inequality_op, + storage, + flat_block_id, + flat_block_thread_id, + is_last_block, + valid_in_last_block); + +// Convert true/false is_selected flags to 0s and 1s +#pragma unroll + for(unsigned int i = 0; i < items_per_thread; i++) + { + output_indices[i] = is_selected[i] ? 1 : 0; + } - // Scatter selected and rejected values - partition_scatter( - values, is_selected, output_indices, output, size, - selected_prefix, selected_in_block, storage.exchange_values, - flat_block_id, flat_block_thread_id, - is_last_block, valid_in_last_block - ); + // Number of selected values in previous blocks + offset_type selected_prefix = 0; + // Number of selected values in this block + offset_type selected_in_block = 0; - // Last block in grid stores number of selected values - if(flat_block_id == (number_of_blocks - 1) && flat_block_thread_id == 0) - { - selected_count_output[0] = selected_prefix + selected_in_block; + // Calculate number of selected values in block and their indices + if(flat_block_id == 0) + { + block_scan_offset_type().exclusive_scan(output_indices, + output_indices, + offset_type(0), /** initial value */ + selected_in_block, + storage.scan_offsets, + ::rocprim::plus()); + if(flat_block_thread_id == 0) + { + offset_scan_state.set_complete(flat_block_id, selected_in_block); + } + ::rocprim::syncthreads(); // sync threads to reuse shared memory + } + // Workaround: Fiji (gfx803) crashes with "Memory access fault by GPU node" on HCC 1.3.18482 (ROCm 2.0) + // Instead of just `} else {` we use `} syncthreads(); if() {`, because the else-branch can be executed + // for some unknown reason and 0-th block reads incorrect addresses in lookback_scan_prefix_op::get_prefix. + ::rocprim::syncthreads(); + if(flat_block_id > 0) + // end of the workaround + { + ROCPRIM_SHARED_MEMORY + typename offset_scan_prefix_op_type::storage_type storage_prefix_op; + auto prefix_op + = offset_scan_prefix_op_type(flat_block_id, offset_scan_state, storage_prefix_op); + block_scan_offset_type().exclusive_scan(output_indices, + output_indices, + storage.scan_offsets, + prefix_op, + ::rocprim::plus()); + ::rocprim::syncthreads(); // sync threads to reuse shared memory + + selected_in_block = prefix_op.get_reduction(); + selected_prefix = prefix_op.get_exclusive_prefix(); + } + + // Scatter selected and rejected values + partition_scatter(values, + is_selected, + output_indices, + output, + size, + selected_prefix, + selected_in_block, + storage.exchange_values, + flat_block_id, + flat_block_thread_id, + is_last_block, + valid_in_last_block); + + // Last block in grid stores number of selected values + if(flat_block_id == (number_of_blocks - 1) && flat_block_thread_id == 0) + { + selected_count_output[0] = selected_prefix + selected_in_block; + } } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_radix_sort.hpp b/rocprim/include/rocprim/device/detail/device_radix_sort.hpp index 7c9b33251..0b2a5ffa4 100644 --- a/rocprim/include/rocprim/device/detail/device_radix_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_radix_sort.hpp @@ -21,567 +21,584 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_RADIX_SORT_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_RADIX_SORT_HPP_ -#include #include +#include #include "../../config.hpp" -#include "../../detail/various.hpp" #include "../../detail/radix_sort.hpp" +#include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_discontinuity.hpp" #include "../../block/block_exchange.hpp" #include "../../block/block_load.hpp" #include "../../block/block_load_func.hpp" -#include "../../block/block_scan.hpp" #include "../../block/block_radix_sort.hpp" +#include "../../block/block_scan.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Wrapping functions that allow to call proper methods (with or without values) -// (a variant with values is enabled only when Value is not empty_type) -template -ROCPRIM_DEVICE inline -void sort_block(SortType sorter, - SortKey (&keys)[ItemsPerThread], - SortValue (&values)[ItemsPerThread], - typename SortType::storage_type& storage, - unsigned int begin_bit, - unsigned int end_bit) -{ - if(Descending) - { - sorter.sort_desc(keys, values, storage, begin_bit, end_bit); - } - else + // Wrapping functions that allow to call proper methods (with or without values) + // (a variant with values is enabled only when Value is not empty_type) + template + ROCPRIM_DEVICE inline void sort_block(SortType sorter, + SortKey (&keys)[ItemsPerThread], + SortValue (&values)[ItemsPerThread], + typename SortType::storage_type& storage, + unsigned int begin_bit, + unsigned int end_bit) { - sorter.sort(keys, values, storage, begin_bit, end_bit); - } -} - -template -ROCPRIM_DEVICE inline -void sort_block(SortType sorter, - SortKey (&keys)[ItemsPerThread], - ::rocprim::empty_type (&values)[ItemsPerThread], - typename SortType::storage_type& storage, - unsigned int begin_bit, - unsigned int end_bit) -{ - (void) values; - if(Descending) - { - sorter.sort_desc(keys, storage, begin_bit, end_bit); + if(Descending) + { + sorter.sort_desc(keys, values, storage, begin_bit, end_bit); + } + else + { + sorter.sort(keys, values, storage, begin_bit, end_bit); + } } - else + + template + ROCPRIM_DEVICE inline void sort_block(SortType sorter, + SortKey (&keys)[ItemsPerThread], + ::rocprim::empty_type (&values)[ItemsPerThread], + typename SortType::storage_type& storage, + unsigned int begin_bit, + unsigned int end_bit) { - sorter.sort(keys, storage, begin_bit, end_bit); + (void)values; + if(Descending) + { + sorter.sort_desc(keys, storage, begin_bit, end_bit); + } + else + { + sorter.sort(keys, storage, begin_bit, end_bit); + } } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits, - bool Descending -> -struct radix_digit_count_helper -{ - static constexpr unsigned int radix_size = 1 << RadixBits; - - static constexpr unsigned int warp_size = ::rocprim::warp_size(); - static constexpr unsigned int warps_no = BlockSize / warp_size; - static_assert(BlockSize % warp_size == 0, "BlockSize must be divisible by warp size"); - static_assert(radix_size <= BlockSize, "Radix size must not exceed BlockSize"); - struct storage_type + template + struct radix_digit_count_helper { - unsigned int digit_counts[warps_no][radix_size]; - }; + static constexpr unsigned int radix_size = 1 << RadixBits; - template - ROCPRIM_DEVICE inline - void count_digits(KeysInputIterator keys_input, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int bit, - unsigned int current_radix_bits, - storage_type& storage, - unsigned int& digit_count) // i-th thread will get i-th digit's value - { - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + static constexpr unsigned int warp_size = ::rocprim::warp_size(); + static constexpr unsigned int warps_no = BlockSize / warp_size; + static_assert(BlockSize % warp_size == 0, "BlockSize must be divisible by warp size"); + static_assert(radix_size <= BlockSize, "Radix size must not exceed BlockSize"); - using key_type = typename std::iterator_traits::value_type; + struct storage_type + { + unsigned int digit_counts[warps_no][radix_size]; + }; - using key_codec = radix_key_codec; - using bit_key_type = typename key_codec::bit_key_type; + template + ROCPRIM_DEVICE inline void + count_digits(KeysInputIterator keys_input, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int bit, + unsigned int current_radix_bits, + storage_type& storage, + unsigned int& digit_count) // i-th thread will get i-th digit's value + { + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int radix_mask = (1u << current_radix_bits) - 1; + using key_type = typename std::iterator_traits::value_type; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int warp_id = ::rocprim::warp_id(); + using key_codec = radix_key_codec; + using bit_key_type = typename key_codec::bit_key_type; - if(flat_id < radix_size) - { - for(unsigned int w = 0; w < warps_no; w++) - { - storage.digit_counts[w][flat_id] = 0; - } - } - ::rocprim::syncthreads(); + const unsigned int radix_mask = (1u << current_radix_bits) - 1; - for(unsigned int block_offset = begin_offset; block_offset < end_offset; block_offset += items_per_block) - { - key_type keys[ItemsPerThread]; - unsigned int valid_count; - // Use loading into a striped arrangement because an order of items is irrelevant, - // only totals matter - if(IsFull || (block_offset + items_per_block <= end_offset)) - { - valid_count = items_per_block; - block_load_direct_striped(flat_id, keys_input + block_offset, keys); - } - else + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int warp_id = ::rocprim::warp_id(); + + if(flat_id < radix_size) { - valid_count = end_offset - block_offset; - block_load_direct_striped(flat_id, keys_input + block_offset, keys, valid_count); + for(unsigned int w = 0; w < warps_no; w++) + { + storage.digit_counts[w][flat_id] = 0; + } } + ::rocprim::syncthreads(); - for(unsigned int i = 0; i < ItemsPerThread; i++) + for(unsigned int block_offset = begin_offset; block_offset < end_offset; + block_offset += items_per_block) { - const bit_key_type bit_key = key_codec::encode(keys[i]); - const unsigned int digit = (bit_key >> bit) & radix_mask; - const unsigned int pos = i * BlockSize + flat_id; - unsigned long long same_digit_lanes_mask = ::rocprim::ballot(IsFull || (pos < valid_count)); - for(unsigned int b = 0; b < RadixBits; b++) + key_type keys[ItemsPerThread]; + unsigned int valid_count; + // Use loading into a striped arrangement because an order of items is irrelevant, + // only totals matter + if(IsFull || (block_offset + items_per_block <= end_offset)) + { + valid_count = items_per_block; + block_load_direct_striped(flat_id, keys_input + block_offset, keys); + } + else { - const unsigned int bit_set = digit & (1u << b); - const unsigned long long bit_set_mask = ::rocprim::ballot(bit_set); - same_digit_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask); + valid_count = end_offset - block_offset; + block_load_direct_striped( + flat_id, keys_input + block_offset, keys, valid_count); } - const unsigned int same_digit_count = ::rocprim::bit_count(same_digit_lanes_mask); - const unsigned int prev_same_digit_count = ::rocprim::masked_bit_count(same_digit_lanes_mask); - if(prev_same_digit_count == 0) + + for(unsigned int i = 0; i < ItemsPerThread; i++) { - // Write the number of lanes having this digit, - // if the current lane is the first (and maybe only) lane with this digit. - storage.digit_counts[warp_id][digit] += same_digit_count; + const bit_key_type bit_key = key_codec::encode(keys[i]); + const unsigned int digit = (bit_key >> bit) & radix_mask; + const unsigned int pos = i * BlockSize + flat_id; + unsigned long long same_digit_lanes_mask + = ::rocprim::ballot(IsFull || (pos < valid_count)); + for(unsigned int b = 0; b < RadixBits; b++) + { + const unsigned int bit_set = digit & (1u << b); + const unsigned long long bit_set_mask = ::rocprim::ballot(bit_set); + same_digit_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask); + } + const unsigned int same_digit_count + = ::rocprim::bit_count(same_digit_lanes_mask); + const unsigned int prev_same_digit_count + = ::rocprim::masked_bit_count(same_digit_lanes_mask); + if(prev_same_digit_count == 0) + { + // Write the number of lanes having this digit, + // if the current lane is the first (and maybe only) lane with this digit. + storage.digit_counts[warp_id][digit] += same_digit_count; + } } } - } - ::rocprim::syncthreads(); + ::rocprim::syncthreads(); - digit_count = 0; - if(flat_id < radix_size) - { - for(unsigned int w = 0; w < warps_no; w++) + digit_count = 0; + if(flat_id < radix_size) { - digit_count += storage.digit_counts[w][flat_id]; + for(unsigned int w = 0; w < warps_no; w++) + { + digit_count += storage.digit_counts[w][flat_id]; + } } } - } -}; - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits, - bool Descending, - class Key, - class Value -> -struct radix_sort_and_scatter_helper -{ - static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - static constexpr unsigned int radix_size = 1 << RadixBits; - - using key_type = Key; - using value_type = Value; - - using key_codec = radix_key_codec; - using bit_key_type = typename key_codec::bit_key_type; - using keys_load_type = ::rocprim::block_load< - key_type, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using values_load_type = ::rocprim::block_load< - value_type, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using sort_type = ::rocprim::block_radix_sort; - using discontinuity_type = ::rocprim::block_discontinuity; - using bit_keys_exchange_type = ::rocprim::block_exchange; - using values_exchange_type = ::rocprim::block_exchange; - - static constexpr bool with_values = !std::is_same::value; - - struct storage_type - { - union - { - typename keys_load_type::storage_type keys_load; - typename values_load_type::storage_type values_load; - typename sort_type::storage_type sort; - typename discontinuity_type::storage_type discontinuity; - typename bit_keys_exchange_type::storage_type bit_keys_exchange; - typename values_exchange_type::storage_type values_exchange; - }; - - unsigned short starts[radix_size]; - unsigned short ends[radix_size]; - - unsigned int digit_starts[radix_size]; }; - template< - bool IsFull = false, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator - > - ROCPRIM_DEVICE inline - void sort_and_scatter(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int bit, - unsigned int current_radix_bits, - unsigned int digit_start, // i-th thread must pass i-th digit's value - storage_type& storage) + template + struct radix_sort_and_scatter_helper { - const unsigned int radix_mask = (1u << current_radix_bits) - 1; + static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + static constexpr unsigned int radix_size = 1 << RadixBits; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + using key_type = Key; + using value_type = Value; - if(flat_id < radix_size) + using key_codec = radix_key_codec; + using bit_key_type = typename key_codec::bit_key_type; + using keys_load_type + = ::rocprim::block_load; + using values_load_type + = ::rocprim::block_load; + using sort_type + = ::rocprim::block_radix_sort; + using discontinuity_type = ::rocprim::block_discontinuity; + using bit_keys_exchange_type + = ::rocprim::block_exchange; + using values_exchange_type + = ::rocprim::block_exchange; + + static constexpr bool with_values = !std::is_same::value; + + struct storage_type { - storage.digit_starts[flat_id] = digit_start; - } + union + { + typename keys_load_type::storage_type keys_load; + typename values_load_type::storage_type values_load; + typename sort_type::storage_type sort; + typename discontinuity_type::storage_type discontinuity; + typename bit_keys_exchange_type::storage_type bit_keys_exchange; + typename values_exchange_type::storage_type values_exchange; + }; + + unsigned short starts[radix_size]; + unsigned short ends[radix_size]; + + unsigned int digit_starts[radix_size]; + }; - for(unsigned int block_offset = begin_offset; block_offset < end_offset; block_offset += items_per_block) + template + ROCPRIM_DEVICE inline void + sort_and_scatter(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int bit, + unsigned int current_radix_bits, + unsigned int digit_start, // i-th thread must pass i-th digit's value + storage_type& storage) { - key_type keys[ItemsPerThread]; - value_type values[ItemsPerThread]; - unsigned int valid_count; - if(IsFull || (block_offset + items_per_block <= end_offset)) - { - valid_count = items_per_block; - keys_load_type().load(keys_input + block_offset, keys, storage.keys_load); - if(with_values) - { - ::rocprim::syncthreads(); - values_load_type().load(values_input + block_offset, values, storage.values_load); - } - } - else - { - valid_count = end_offset - block_offset; - // Sort will leave "invalid" (out of size) items at the end of the sorted sequence - const key_type out_of_bounds = key_codec::decode(bit_key_type(-1)); - keys_load_type().load(keys_input + block_offset, keys, valid_count, out_of_bounds, storage.keys_load); - if(with_values) - { - ::rocprim::syncthreads(); - values_load_type().load(values_input + block_offset, values, valid_count, storage.values_load); - } - } - bit_key_type bit_keys[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - bit_keys[i] = key_codec::encode(keys[i]); - } + const unsigned int radix_mask = (1u << current_radix_bits) - 1; + + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); if(flat_id < radix_size) { - storage.starts[flat_id] = valid_count; - storage.ends[flat_id] = valid_count; + storage.digit_starts[flat_id] = digit_start; } - ::rocprim::syncthreads(); - sort_block(sort_type(), bit_keys, values, storage.sort, bit, bit + current_radix_bits); - - unsigned int digits[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) + for(unsigned int block_offset = begin_offset; block_offset < end_offset; + block_offset += items_per_block) { - digits[i] = (bit_keys[i] >> bit) & radix_mask; - } + key_type keys[ItemsPerThread]; + value_type values[ItemsPerThread]; + unsigned int valid_count; + if(IsFull || (block_offset + items_per_block <= end_offset)) + { + valid_count = items_per_block; + keys_load_type().load(keys_input + block_offset, keys, storage.keys_load); + if(with_values) + { + ::rocprim::syncthreads(); + values_load_type().load( + values_input + block_offset, values, storage.values_load); + } + } + else + { + valid_count = end_offset - block_offset; + // Sort will leave "invalid" (out of size) items at the end of the sorted sequence + const key_type out_of_bounds = key_codec::decode(bit_key_type(-1)); + keys_load_type().load(keys_input + block_offset, + keys, + valid_count, + out_of_bounds, + storage.keys_load); + if(with_values) + { + ::rocprim::syncthreads(); + values_load_type().load( + values_input + block_offset, values, valid_count, storage.values_load); + } + } + bit_key_type bit_keys[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + bit_keys[i] = key_codec::encode(keys[i]); + } - bool head_flags[ItemsPerThread]; - bool tail_flags[ItemsPerThread]; - ::rocprim::not_equal_to flag_op; + if(flat_id < radix_size) + { + storage.starts[flat_id] = valid_count; + storage.ends[flat_id] = valid_count; + } - ::rocprim::syncthreads(); - discontinuity_type().flag_heads_and_tails(head_flags, tail_flags, digits, flag_op, storage.discontinuity); + ::rocprim::syncthreads(); + sort_block( + sort_type(), bit_keys, values, storage.sort, bit, bit + current_radix_bits); - // Fill start and end position of subsequence for every digit - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - const unsigned int digit = digits[i]; - const unsigned int pos = flat_id * ItemsPerThread + i; - if(head_flags[i]) + unsigned int digits[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) { - storage.starts[digit] = pos; + digits[i] = (bit_keys[i] >> bit) & radix_mask; } - if(tail_flags[i]) + + bool head_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; + ::rocprim::not_equal_to flag_op; + + ::rocprim::syncthreads(); + discontinuity_type().flag_heads_and_tails( + head_flags, tail_flags, digits, flag_op, storage.discontinuity); + + // Fill start and end position of subsequence for every digit + for(unsigned int i = 0; i < ItemsPerThread; i++) { - storage.ends[digit] = pos; + const unsigned int digit = digits[i]; + const unsigned int pos = flat_id * ItemsPerThread + i; + if(head_flags[i]) + { + storage.starts[digit] = pos; + } + if(tail_flags[i]) + { + storage.ends[digit] = pos; + } } - } - ::rocprim::syncthreads(); - // Rearrange to striped arrangement to have faster coalesced writes instead of - // scattering of blocked-arranged items - bit_keys_exchange_type().blocked_to_striped(bit_keys, bit_keys, storage.bit_keys_exchange); - if(with_values) - { ::rocprim::syncthreads(); - values_exchange_type().blocked_to_striped(values, values, storage.values_exchange); - } + // Rearrange to striped arrangement to have faster coalesced writes instead of + // scattering of blocked-arranged items + bit_keys_exchange_type().blocked_to_striped( + bit_keys, bit_keys, storage.bit_keys_exchange); + if(with_values) + { + ::rocprim::syncthreads(); + values_exchange_type().blocked_to_striped( + values, values, storage.values_exchange); + } - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - const unsigned int digit = (bit_keys[i] >> bit) & radix_mask; - const unsigned int pos = i * BlockSize + flat_id; - if(IsFull || (pos < valid_count)) + for(unsigned int i = 0; i < ItemsPerThread; i++) { - const unsigned int dst = pos - storage.starts[digit] + storage.digit_starts[digit]; - keys_output[dst] = key_codec::decode(bit_keys[i]); - if(with_values) + const unsigned int digit = (bit_keys[i] >> bit) & radix_mask; + const unsigned int pos = i * BlockSize + flat_id; + if(IsFull || (pos < valid_count)) { - values_output[dst] = values[i]; + const unsigned int dst + = pos - storage.starts[digit] + storage.digit_starts[digit]; + keys_output[dst] = key_codec::decode(bit_keys[i]); + if(with_values) + { + values_output[dst] = values[i]; + } } } - } - ::rocprim::syncthreads(); + ::rocprim::syncthreads(); - // Accumulate counts of the current block - if(flat_id < radix_size) - { - const unsigned int digit = flat_id; - const unsigned int start = storage.starts[digit]; - const unsigned int end = storage.ends[digit]; - if(start < valid_count) + // Accumulate counts of the current block + if(flat_id < radix_size) { - storage.digit_starts[digit] += (::rocprim::min(valid_count - 1, end) - start + 1); + const unsigned int digit = flat_id; + const unsigned int start = storage.starts[digit]; + const unsigned int end = storage.ends[digit]; + if(start < valid_count) + { + storage.digit_starts[digit] + += (::rocprim::min(valid_count - 1, end) - start + 1); + } } } } - } -}; - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits, - bool Descending, - class KeysInputIterator -> -ROCPRIM_DEVICE inline -void fill_digit_counts(KeysInputIterator keys_input, - unsigned int size, - unsigned int * batch_digit_counts, - unsigned int bit, - unsigned int current_radix_bits, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - constexpr unsigned int radix_size = 1 << RadixBits; - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + }; - using count_helper_type = radix_digit_count_helper; + template + ROCPRIM_DEVICE inline void fill_digit_counts(KeysInputIterator keys_input, + unsigned int size, + unsigned int* batch_digit_counts, + unsigned int bit, + unsigned int current_radix_bits, + unsigned int blocks_per_full_batch, + unsigned int full_batches) + { + constexpr unsigned int radix_size = 1 << RadixBits; + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - ROCPRIM_SHARED_MEMORY typename count_helper_type::storage_type storage; + using count_helper_type + = radix_digit_count_helper; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int batch_id = ::rocprim::detail::block_id<0>(); + ROCPRIM_SHARED_MEMORY typename count_helper_type::storage_type storage; - unsigned int block_offset; - unsigned int blocks_per_batch; - if(batch_id < full_batches) - { - blocks_per_batch = blocks_per_full_batch; - block_offset = batch_id * blocks_per_batch; - } - else - { - blocks_per_batch = blocks_per_full_batch - 1; - block_offset = batch_id * blocks_per_batch + full_batches; - } - block_offset *= items_per_block; + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int batch_id = ::rocprim::detail::block_id<0>(); - unsigned int digit_count; - if(batch_id < ::rocprim::detail::grid_size<0>() - 1) - { - count_helper_type().template count_digits( - keys_input, - block_offset, block_offset + blocks_per_batch * items_per_block, - bit, current_radix_bits, - storage, - digit_count - ); - } - else - { - count_helper_type().template count_digits( - keys_input, - block_offset, size, - bit, current_radix_bits, - storage, - digit_count - ); + unsigned int block_offset; + unsigned int blocks_per_batch; + if(batch_id < full_batches) + { + blocks_per_batch = blocks_per_full_batch; + block_offset = batch_id * blocks_per_batch; + } + else + { + blocks_per_batch = blocks_per_full_batch - 1; + block_offset = batch_id * blocks_per_batch + full_batches; + } + block_offset *= items_per_block; + + unsigned int digit_count; + if(batch_id < ::rocprim::detail::grid_size<0>() - 1) + { + count_helper_type().template count_digits( + keys_input, + block_offset, + block_offset + blocks_per_batch * items_per_block, + bit, + current_radix_bits, + storage, + digit_count); + } + else + { + count_helper_type().template count_digits( + keys_input, block_offset, size, bit, current_radix_bits, storage, digit_count); + } + + if(flat_id < radix_size) + { + batch_digit_counts[batch_id * radix_size + flat_id] = digit_count; + } } - if(flat_id < radix_size) + template + ROCPRIM_DEVICE inline void scan_batches(unsigned int* batch_digit_counts, + unsigned int* digit_counts, + unsigned int batches) { - batch_digit_counts[batch_id * radix_size + flat_id] = digit_count; - } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits -> -ROCPRIM_DEVICE inline -void scan_batches(unsigned int * batch_digit_counts, - unsigned int * digit_counts, - unsigned int batches) -{ - constexpr unsigned int radix_size = 1 << RadixBits; + constexpr unsigned int radix_size = 1 << RadixBits; - using scan_type = typename ::rocprim::block_scan; + using scan_type = typename ::rocprim::block_scan; - const unsigned int digit = ::rocprim::detail::block_id<0>(); - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int digit = ::rocprim::detail::block_id<0>(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - unsigned int values[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - const unsigned int batch_id = flat_id * ItemsPerThread + i; - values[i] = (batch_id < batches ? batch_digit_counts[batch_id * radix_size + digit] : 0); - } + unsigned int values[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + const unsigned int batch_id = flat_id * ItemsPerThread + i; + values[i] + = (batch_id < batches ? batch_digit_counts[batch_id * radix_size + digit] : 0); + } - unsigned int digit_count; - scan_type().exclusive_scan(values, values, 0, digit_count); + unsigned int digit_count; + scan_type().exclusive_scan(values, values, 0, digit_count); - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - const unsigned int batch_id = flat_id * ItemsPerThread + i; - if(batch_id < batches) + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + const unsigned int batch_id = flat_id * ItemsPerThread + i; + if(batch_id < batches) + { + batch_digit_counts[batch_id * radix_size + digit] = values[i]; + } + } + + if(flat_id == 0) { - batch_digit_counts[batch_id * radix_size + digit] = values[i]; + digit_counts[digit] = digit_count; } } - if(flat_id == 0) + template + ROCPRIM_DEVICE inline void scan_digits(unsigned int* digit_counts) { - digit_counts[digit] = digit_count; + constexpr unsigned int radix_size = 1 << RadixBits; + + using scan_type = typename ::rocprim::block_scan; + + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + + unsigned int value = digit_counts[flat_id]; + scan_type().exclusive_scan(value, value, 0); + digit_counts[flat_id] = value; } -} -template -ROCPRIM_DEVICE inline -void scan_digits(unsigned int * digit_counts) -{ - constexpr unsigned int radix_size = 1 << RadixBits; - - using scan_type = typename ::rocprim::block_scan; - - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - - unsigned int value = digit_counts[flat_id]; - scan_type().exclusive_scan(value, value, 0); - digit_counts[flat_id] = value; -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits, - bool Descending, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator -> -ROCPRIM_DEVICE inline -void sort_and_scatter(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int size, - const unsigned int * batch_digit_starts, - const unsigned int * digit_starts, - unsigned int bit, - unsigned int current_radix_bits, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - constexpr unsigned int radix_size = 1 << RadixBits; + template + ROCPRIM_DEVICE inline void sort_and_scatter(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int size, + const unsigned int* batch_digit_starts, + const unsigned int* digit_starts, + unsigned int bit, + unsigned int current_radix_bits, + unsigned int blocks_per_full_batch, + unsigned int full_batches) + { + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + constexpr unsigned int radix_size = 1 << RadixBits; - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; - using sort_and_scatter_helper = radix_sort_and_scatter_helper< - BlockSize, ItemsPerThread, RadixBits, Descending, - key_type, value_type - >; + using sort_and_scatter_helper = radix_sort_and_scatter_helper; - ROCPRIM_SHARED_MEMORY typename sort_and_scatter_helper::storage_type storage; + ROCPRIM_SHARED_MEMORY typename sort_and_scatter_helper::storage_type storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int batch_id = ::rocprim::detail::block_id<0>(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int batch_id = ::rocprim::detail::block_id<0>(); - unsigned int block_offset; - unsigned int blocks_per_batch; - if(batch_id < full_batches) - { - blocks_per_batch = blocks_per_full_batch; - block_offset = batch_id * blocks_per_batch; - } - else - { - blocks_per_batch = blocks_per_full_batch - 1; - block_offset = batch_id * blocks_per_batch + full_batches; - } - block_offset *= items_per_block; + unsigned int block_offset; + unsigned int blocks_per_batch; + if(batch_id < full_batches) + { + blocks_per_batch = blocks_per_full_batch; + block_offset = batch_id * blocks_per_batch; + } + else + { + blocks_per_batch = blocks_per_full_batch - 1; + block_offset = batch_id * blocks_per_batch + full_batches; + } + block_offset *= items_per_block; - unsigned int digit_start = 0; - if(flat_id < radix_size) - { - digit_start = digit_starts[flat_id] + batch_digit_starts[batch_id * radix_size + flat_id]; - } + unsigned int digit_start = 0; + if(flat_id < radix_size) + { + digit_start + = digit_starts[flat_id] + batch_digit_starts[batch_id * radix_size + flat_id]; + } - if(batch_id < ::rocprim::detail::grid_size<0>() - 1) - { - sort_and_scatter_helper().template sort_and_scatter( - keys_input, keys_output, values_input, values_output, - block_offset, block_offset + blocks_per_batch * items_per_block, - bit, current_radix_bits, - digit_start, - storage - ); - } - else - { - sort_and_scatter_helper().template sort_and_scatter( - keys_input, keys_output, values_input, values_output, - block_offset, size, - bit, current_radix_bits, - digit_start, - storage - ); + if(batch_id < ::rocprim::detail::grid_size<0>() - 1) + { + sort_and_scatter_helper().template sort_and_scatter( + keys_input, + keys_output, + values_input, + values_output, + block_offset, + block_offset + blocks_per_batch * items_per_block, + bit, + current_radix_bits, + digit_start, + storage); + } + else + { + sort_and_scatter_helper().template sort_and_scatter(keys_input, + keys_output, + values_input, + values_output, + block_offset, + size, + bit, + current_radix_bits, + digit_start, + storage); + } } -} } // end namespace detail diff --git a/rocprim/include/rocprim/device/detail/device_reduce.hpp b/rocprim/include/rocprim/device/detail/device_reduce.hpp index eb0f0c98a..e933a8f92 100644 --- a/rocprim/include/rocprim/device/detail/device_reduce.hpp +++ b/rocprim/include/rocprim/device/detail/device_reduce.hpp @@ -21,14 +21,14 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_REDUCE_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_REDUCE_HPP_ -#include #include +#include #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_load.hpp" @@ -39,143 +39,109 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Helper functions for reducing final value with -// initial value. -template< - bool WithInitialValue, - class T, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto reduce_with_initial(T output, - T initial_value, - BinaryFunction reduce_op) - -> typename std::enable_if::type -{ - return reduce_op(initial_value, output); -} - -template< - bool WithInitialValue, - class T, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto reduce_with_initial(T output, - T initial_value, - BinaryFunction reduce_op) - -> typename std::enable_if::type -{ - (void) initial_value; - (void) reduce_op; - return output; -} - -template< - bool WithInitialValue, - class Config, - class ResultType, - class InputIterator, - class OutputIterator, - class InitValueType, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void block_reduce_kernel_impl(InputIterator input, - const size_t input_size, - OutputIterator output, - InitValueType initial_value, - BinaryFunction reduce_op) -{ - constexpr unsigned int block_size = Config::block_size; - constexpr unsigned int items_per_thread = Config::items_per_thread; - - using result_type = ResultType; - - using block_reduce_type = ::rocprim::block_reduce< - result_type, block_size, - Config::block_reduce_method - >; - constexpr unsigned int items_per_block = block_size * items_per_thread; - - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - const unsigned int block_offset = flat_block_id * items_per_block; - const unsigned int number_of_blocks = ::rocprim::detail::grid_size<0>(); - auto valid_in_last_block = input_size - items_per_block * (number_of_blocks - 1); - - result_type values[items_per_thread]; - result_type output_value; - if(flat_block_id == (number_of_blocks - 1)) // last block + // Helper functions for reducing final value with + // initial value. + template + ROCPRIM_DEVICE inline auto + reduce_with_initial(T output, T initial_value, BinaryFunction reduce_op) -> + typename std::enable_if::type { - block_load_direct_striped( - flat_id, - input + block_offset, - values, - valid_in_last_block - ); - - output_value = values[0]; - #pragma unroll - for(unsigned int i = 1; i < items_per_thread; i++) + return reduce_op(initial_value, output); + } + + template + ROCPRIM_DEVICE inline auto + reduce_with_initial(T output, T initial_value, BinaryFunction reduce_op) -> + typename std::enable_if::type + { + (void)initial_value; + (void)reduce_op; + return output; + } + + template + ROCPRIM_DEVICE inline void block_reduce_kernel_impl(InputIterator input, + const size_t input_size, + OutputIterator output, + InitValueType initial_value, + BinaryFunction reduce_op) + { + constexpr unsigned int block_size = Config::block_size; + constexpr unsigned int items_per_thread = Config::items_per_thread; + + using result_type = ResultType; + + using block_reduce_type + = ::rocprim::block_reduce; + constexpr unsigned int items_per_block = block_size * items_per_thread; + + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + const unsigned int block_offset = flat_block_id * items_per_block; + const unsigned int number_of_blocks = ::rocprim::detail::grid_size<0>(); + auto valid_in_last_block = input_size - items_per_block * (number_of_blocks - 1); + + result_type values[items_per_thread]; + result_type output_value; + if(flat_block_id == (number_of_blocks - 1)) // last block { - unsigned int offset = i * block_size; - if(flat_id + offset < valid_in_last_block) + block_load_direct_striped( + flat_id, input + block_offset, values, valid_in_last_block); + + output_value = values[0]; +#pragma unroll + for(unsigned int i = 1; i < items_per_thread; i++) { - output_value = reduce_op(output_value, values[i]); + unsigned int offset = i * block_size; + if(flat_id + offset < valid_in_last_block) + { + output_value = reduce_op(output_value, values[i]); + } } + + block_reduce_type().reduce(output_value, // input + output_value, // output + valid_in_last_block, + reduce_op); } + else + { + block_load_direct_striped(flat_id, input + block_offset, values); - block_reduce_type() - .reduce( - output_value, // input - output_value, // output - valid_in_last_block, - reduce_op - ); - } - else - { - block_load_direct_striped( - flat_id, - input + block_offset, - values - ); - - // load input values into values - block_reduce_type() - .reduce( - values, // input - output_value, // output - reduce_op - ); - } + // load input values into values + block_reduce_type().reduce(values, // input + output_value, // output + reduce_op); + } - // Save value into output - if(flat_id == 0) - { - output[flat_block_id] = input_size == 0 - ? static_cast(initial_value) - : reduce_with_initial( - output_value, - static_cast(initial_value), - reduce_op - ); + // Save value into output + if(flat_id == 0) + { + output[flat_block_id] + = input_size == 0 + ? static_cast(initial_value) + : reduce_with_initial( + output_value, static_cast(initial_value), reduce_op); + } } -} -// Returns size of temporary storage in bytes. -template -size_t reduce_get_temporary_storage_bytes(size_t input_size, - size_t items_per_block) -{ - if(input_size <= items_per_block) + // Returns size of temporary storage in bytes. + template + size_t reduce_get_temporary_storage_bytes(size_t input_size, size_t items_per_block) { - return 0; + if(input_size <= items_per_block) + { + return 0; + } + auto size = (input_size + items_per_block - 1) / (items_per_block); + return size * sizeof(T) + reduce_get_temporary_storage_bytes(size, items_per_block); } - auto size = (input_size + items_per_block - 1)/(items_per_block); - return size * sizeof(T) + reduce_get_temporary_storage_bytes(size, items_per_block); -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp index 968023c6a..34aaf512e 100644 --- a/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp +++ b/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp @@ -27,603 +27,607 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../block/block_discontinuity.hpp" -#include "../../block/block_load_func.hpp" #include "../../block/block_load.hpp" -#include "../../block/block_store.hpp" +#include "../../block/block_load_func.hpp" #include "../../block/block_scan.hpp" +#include "../../block/block_store.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -struct carry_out -{ - ROCPRIM_DEVICE inline - carry_out& operator=(carry_out rhs) + template + struct carry_out { - value = rhs.value; - destination = rhs.destination; - next_has_carry_in = rhs.next_has_carry_in; - return *this; - } + ROCPRIM_DEVICE inline carry_out& operator=(carry_out rhs) + { + value = rhs.value; + destination = rhs.destination; + next_has_carry_in = rhs.next_has_carry_in; + return *this; + } - Value value; // carry-out of the current batch - unsigned int destination; - bool next_has_carry_in; // the next batch has carry-in (i.e. it continues the last segment from the current batch) -}; + Value value; // carry-out of the current batch + unsigned int destination; + bool + next_has_carry_in; // the next batch has carry-in (i.e. it continues the last segment from the current batch) + }; -template -struct scan_by_key_pair -{ - ROCPRIM_DEVICE inline - scan_by_key_pair& operator=(scan_by_key_pair rhs) + template + struct scan_by_key_pair { - key = rhs.key; - value = rhs.value; - return *this; - } - - unsigned int key; - Value value; -}; - -// Special operator which allows to calculate scan-by-key using block_scan. -// block_scan supports non-commutative scan operators. -// Initial values of pairs' keys must be 1 for the first item (head) of segment and 0 otherwise. -// As a result key contains the current segment's index and value contains segmented scan result. -template -struct scan_by_key_op -{ - BinaryFunction reduce_op; + ROCPRIM_DEVICE inline scan_by_key_pair& operator=(scan_by_key_pair rhs) + { + key = rhs.key; + value = rhs.value; + return *this; + } - ROCPRIM_DEVICE inline - scan_by_key_op(BinaryFunction reduce_op) - : reduce_op(reduce_op) - {} + unsigned int key; + Value value; + }; - ROCPRIM_DEVICE inline - Pair operator()(const Pair& a, const Pair& b) + // Special operator which allows to calculate scan-by-key using block_scan. + // block_scan supports non-commutative scan operators. + // Initial values of pairs' keys must be 1 for the first item (head) of segment and 0 otherwise. + // As a result key contains the current segment's index and value contains segmented scan result. + template + struct scan_by_key_op { - Pair c; - c.key = a.key + b.key; - c.value = b.key != 0 - ? b.value - : reduce_op(a.value, b.value); - return c; - } -}; + BinaryFunction reduce_op; -// Wrappers that reverse results of key comparizon functions to use them as flag_op of block_discontinuity -// (for example, equal_to will work as not_equal_to and divide items into segments by keys) -template -struct key_flag_op -{ - KeyCompareFunction key_compare_op; + ROCPRIM_DEVICE inline scan_by_key_op(BinaryFunction reduce_op) + : reduce_op(reduce_op) + { + } - ROCPRIM_DEVICE inline - key_flag_op(KeyCompareFunction key_compare_op) - : key_compare_op(key_compare_op) - {} + ROCPRIM_DEVICE inline Pair operator()(const Pair& a, const Pair& b) + { + Pair c; + c.key = a.key + b.key; + c.value = b.key != 0 ? b.value : reduce_op(a.value, b.value); + return c; + } + }; - ROCPRIM_DEVICE inline - bool operator()(const Key& a, const Key& b) + // Wrappers that reverse results of key comparizon functions to use them as flag_op of block_discontinuity + // (for example, equal_to will work as not_equal_to and divide items into segments by keys) + template + struct key_flag_op { - return !key_compare_op(a, b); - } -}; + KeyCompareFunction key_compare_op; -// This wrapper processes only part of items and flags (valid_count - 1)th item (for tails) -// and (valid_count)th item (for heads), all items after valid_count are unflagged. -template -struct guarded_key_flag_op -{ - KeyCompareFunction key_compare_op; - unsigned int valid_count; + ROCPRIM_DEVICE inline key_flag_op(KeyCompareFunction key_compare_op) + : key_compare_op(key_compare_op) + { + } - ROCPRIM_DEVICE inline - guarded_key_flag_op(KeyCompareFunction key_compare_op, unsigned int valid_count) - : key_compare_op(key_compare_op), valid_count(valid_count) - {} + ROCPRIM_DEVICE inline bool operator()(const Key& a, const Key& b) + { + return !key_compare_op(a, b); + } + }; - ROCPRIM_DEVICE inline - bool operator()(const Key& a, const Key& b, unsigned int b_index) + // This wrapper processes only part of items and flags (valid_count - 1)th item (for tails) + // and (valid_count)th item (for heads), all items after valid_count are unflagged. + template + struct guarded_key_flag_op { - return (b_index < valid_count && !key_compare_op(a, b)) || b_index == valid_count; - } -}; - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class KeysInputIterator, - class KeyCompareFunction -> -ROCPRIM_DEVICE inline -void fill_unique_counts(KeysInputIterator keys_input, - unsigned int size, - unsigned int * unique_counts, - KeyCompareFunction key_compare_op, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - constexpr unsigned int warp_size = ::rocprim::warp_size(); - constexpr unsigned int warps_no = BlockSize / warp_size; - - using key_type = typename std::iterator_traits::value_type; + KeyCompareFunction key_compare_op; + unsigned int valid_count; - using keys_load_type = ::rocprim::block_load< - key_type, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using discontinuity_type = ::rocprim::block_discontinuity; + ROCPRIM_DEVICE inline guarded_key_flag_op(KeyCompareFunction key_compare_op, + unsigned int valid_count) + : key_compare_op(key_compare_op) + , valid_count(valid_count) + { + } - ROCPRIM_SHARED_MEMORY struct - { - union + ROCPRIM_DEVICE inline bool operator()(const Key& a, const Key& b, unsigned int b_index) { - typename keys_load_type::storage_type keys_load; - typename discontinuity_type::storage_type discontinuity; - }; - unsigned int unique_counts[warps_no]; - } storage; - - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int batch_id = ::rocprim::detail::block_id<0>(); - const unsigned int lane_id = ::rocprim::lane_id(); - const unsigned int warp_id = ::rocprim::warp_id(); - - unsigned int block_offset; - unsigned int blocks_per_batch; - if(batch_id < full_batches) - { - blocks_per_batch = blocks_per_full_batch; - block_offset = batch_id * blocks_per_batch; - } - else + return (b_index < valid_count && !key_compare_op(a, b)) || b_index == valid_count; + } + }; + + template + ROCPRIM_DEVICE inline void fill_unique_counts(KeysInputIterator keys_input, + unsigned int size, + unsigned int* unique_counts, + KeyCompareFunction key_compare_op, + unsigned int blocks_per_full_batch, + unsigned int full_batches) { - blocks_per_batch = blocks_per_full_batch - 1; - block_offset = batch_id * blocks_per_batch + full_batches; - } - block_offset *= items_per_block; + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + constexpr unsigned int warp_size = ::rocprim::warp_size(); + constexpr unsigned int warps_no = BlockSize / warp_size; - unsigned int warp_unique_count = 0; + using key_type = typename std::iterator_traits::value_type; - for(unsigned int bi = 0; bi < blocks_per_batch; bi++) - { - const bool is_last_block = (block_offset + items_per_block >= size); + using keys_load_type + = ::rocprim::block_load; + using discontinuity_type = ::rocprim::block_discontinuity; - key_type keys[ItemsPerThread]; - unsigned int valid_count; - ::rocprim::syncthreads(); - if(is_last_block) + ROCPRIM_SHARED_MEMORY struct { - valid_count = size - block_offset; - keys_load_type().load(keys_input + block_offset, keys, valid_count, storage.keys_load); + union + { + typename keys_load_type::storage_type keys_load; + typename discontinuity_type::storage_type discontinuity; + }; + unsigned int unique_counts[warps_no]; + } storage; + + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int batch_id = ::rocprim::detail::block_id<0>(); + const unsigned int lane_id = ::rocprim::lane_id(); + const unsigned int warp_id = ::rocprim::warp_id(); + + unsigned int block_offset; + unsigned int blocks_per_batch; + if(batch_id < full_batches) + { + blocks_per_batch = blocks_per_full_batch; + block_offset = batch_id * blocks_per_batch; } else { - valid_count = items_per_block; - keys_load_type().load(keys_input + block_offset, keys, storage.keys_load); + blocks_per_batch = blocks_per_full_batch - 1; + block_offset = batch_id * blocks_per_batch + full_batches; } + block_offset *= items_per_block; - bool tail_flags[ItemsPerThread]; - key_type successor_key = keys[ItemsPerThread - 1]; - ::rocprim::syncthreads(); - if(is_last_block) - { - discontinuity_type().flag_tails( - tail_flags, successor_key, keys, - guarded_key_flag_op(key_compare_op, valid_count), - storage.discontinuity - ); - } - else + unsigned int warp_unique_count = 0; + + for(unsigned int bi = 0; bi < blocks_per_batch; bi++) { - if(flat_id == BlockSize - 1) + const bool is_last_block = (block_offset + items_per_block >= size); + + key_type keys[ItemsPerThread]; + unsigned int valid_count; + ::rocprim::syncthreads(); + if(is_last_block) + { + valid_count = size - block_offset; + keys_load_type().load( + keys_input + block_offset, keys, valid_count, storage.keys_load); + } + else { - successor_key = keys_input[block_offset + items_per_block]; + valid_count = items_per_block; + keys_load_type().load(keys_input + block_offset, keys, storage.keys_load); } - discontinuity_type().flag_tails( - tail_flags, successor_key, keys, - key_flag_op(key_compare_op), - storage.discontinuity - ); + + bool tail_flags[ItemsPerThread]; + key_type successor_key = keys[ItemsPerThread - 1]; + ::rocprim::syncthreads(); + if(is_last_block) + { + discontinuity_type().flag_tails( + tail_flags, + successor_key, + keys, + guarded_key_flag_op(key_compare_op, valid_count), + storage.discontinuity); + } + else + { + if(flat_id == BlockSize - 1) + { + successor_key = keys_input[block_offset + items_per_block]; + } + discontinuity_type().flag_tails( + tail_flags, + successor_key, + keys, + key_flag_op(key_compare_op), + storage.discontinuity); + } + + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + warp_unique_count += ::rocprim::bit_count(::rocprim::ballot(tail_flags[i])); + } + + block_offset += items_per_block; } - for(unsigned int i = 0; i < ItemsPerThread; i++) + if(lane_id == 0) { - warp_unique_count += ::rocprim::bit_count(::rocprim::ballot(tail_flags[i])); + storage.unique_counts[warp_id] = warp_unique_count; } + ::rocprim::syncthreads(); - block_offset += items_per_block; - } - - if(lane_id == 0) - { - storage.unique_counts[warp_id] = warp_unique_count; - } - ::rocprim::syncthreads(); - - if(flat_id == 0) - { - unsigned int batch_unique_count = 0; - for(unsigned int w = 0; w < warps_no; w++) + if(flat_id == 0) { - batch_unique_count += storage.unique_counts[w]; + unsigned int batch_unique_count = 0; + for(unsigned int w = 0; w < warps_no; w++) + { + batch_unique_count += storage.unique_counts[w]; + } + unique_counts[batch_id] = batch_unique_count; } - unique_counts[batch_id] = batch_unique_count; } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class UniqueCountOutputIterator -> -ROCPRIM_DEVICE inline -void scan_unique_counts(unsigned int * unique_counts, - UniqueCountOutputIterator unique_count_output, - unsigned int batches) -{ - using load_type = ::rocprim::block_load< - unsigned int, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using store_type = ::rocprim::block_store< - unsigned int, BlockSize, ItemsPerThread, - ::rocprim::block_store_method::block_store_transpose>; - using scan_type = typename ::rocprim::block_scan; - - ROCPRIM_SHARED_MEMORY union - { - typename load_type::storage_type load; - typename store_type::storage_type store; - typename scan_type::storage_type scan; - } storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - - unsigned int values[ItemsPerThread]; - load_type().load(unique_counts, values, batches, 0, storage.load); - - unsigned int unique_count; - ::rocprim::syncthreads(); - scan_type().exclusive_scan(values, values, 0, unique_count); - - ::rocprim::syncthreads(); - store_type().store(unique_counts, values, batches, storage.store); - - if(flat_id == 0) + template + ROCPRIM_DEVICE inline void scan_unique_counts(unsigned int* unique_counts, + UniqueCountOutputIterator unique_count_output, + unsigned int batches) { - *unique_count_output = unique_count; - } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class KeysInputIterator, - class ValuesInputIterator, - class Result, - class UniqueOutputIterator, - class AggregatesOutputIterator, - class KeyCompareFunction, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void reduce_by_key(KeysInputIterator keys_input, - ValuesInputIterator values_input, - unsigned int size, - const unsigned int * unique_starts, - carry_out * carry_outs, - Result * leading_aggregates, - UniqueOutputIterator unique_output, - AggregatesOutputIterator aggregates_output, - KeyCompareFunction key_compare_op, - BinaryFunction reduce_op, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + using load_type = ::rocprim::block_load; + using store_type + = ::rocprim::block_store; + using scan_type = typename ::rocprim::block_scan; + + ROCPRIM_SHARED_MEMORY union + { + typename load_type::storage_type load; + typename store_type::storage_type store; + typename scan_type::storage_type scan; + } storage; - using key_type = typename std::iterator_traits::value_type; - using result_type = Result; + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - using keys_load_type = ::rocprim::block_load< - key_type, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using values_load_type = ::rocprim::block_load< - result_type, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using discontinuity_type = ::rocprim::block_discontinuity; - using scan_type = ::rocprim::block_scan, BlockSize>; + unsigned int values[ItemsPerThread]; + load_type().load(unique_counts, values, batches, 0, storage.load); - ROCPRIM_SHARED_MEMORY struct - { - union - { - typename keys_load_type::storage_type keys_load; - typename values_load_type::storage_type values_load; - typename discontinuity_type::storage_type discontinuity; - typename scan_type::storage_type scan; - }; unsigned int unique_count; - bool has_carry_in; - detail::raw_storage carry_in; - } storage; - - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int batch_id = ::rocprim::detail::block_id<0>(); - - unsigned int block_offset; - unsigned int blocks_per_batch; - if(batch_id < full_batches) - { - blocks_per_batch = blocks_per_full_batch; - block_offset = batch_id * blocks_per_batch; - } - else - { - blocks_per_batch = blocks_per_full_batch - 1; - block_offset = batch_id * blocks_per_batch + full_batches; - } - block_offset *= items_per_block; + ::rocprim::syncthreads(); + scan_type().exclusive_scan(values, values, 0, unique_count); - const unsigned int batch_start = unique_starts[batch_id]; - unsigned int block_start = batch_start; + ::rocprim::syncthreads(); + store_type().store(unique_counts, values, batches, storage.store); - if(flat_id == 0) - { - storage.has_carry_in = - (block_offset > 0) && - key_compare_op(keys_input[block_offset - 1], keys_input[block_offset]); + if(flat_id == 0) + { + *unique_count_output = unique_count; + } } - for(unsigned int bi = 0; bi < blocks_per_batch; bi++) + template + ROCPRIM_DEVICE inline void reduce_by_key(KeysInputIterator keys_input, + ValuesInputIterator values_input, + unsigned int size, + const unsigned int* unique_starts, + carry_out* carry_outs, + Result* leading_aggregates, + UniqueOutputIterator unique_output, + AggregatesOutputIterator aggregates_output, + KeyCompareFunction key_compare_op, + BinaryFunction reduce_op, + unsigned int blocks_per_full_batch, + unsigned int full_batches) { - const bool is_last_block = (block_offset + items_per_block >= size); - - key_type keys[ItemsPerThread]; - result_type values[ItemsPerThread]; - unsigned int valid_count; - if(is_last_block) + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + + using key_type = typename std::iterator_traits::value_type; + using result_type = Result; + + using keys_load_type + = ::rocprim::block_load; + using values_load_type + = ::rocprim::block_load; + using discontinuity_type = ::rocprim::block_discontinuity; + using scan_type = ::rocprim::block_scan, BlockSize>; + + ROCPRIM_SHARED_MEMORY struct { - valid_count = size - block_offset; - keys_load_type().load(keys_input + block_offset, keys, valid_count, storage.keys_load); - ::rocprim::syncthreads(); - values_load_type().load(values_input + block_offset, values, valid_count, storage.values_load); + union + { + typename keys_load_type::storage_type keys_load; + typename values_load_type::storage_type values_load; + typename discontinuity_type::storage_type discontinuity; + typename scan_type::storage_type scan; + }; + unsigned int unique_count; + bool has_carry_in; + detail::raw_storage carry_in; + } storage; + + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int batch_id = ::rocprim::detail::block_id<0>(); + + unsigned int block_offset; + unsigned int blocks_per_batch; + if(batch_id < full_batches) + { + blocks_per_batch = blocks_per_full_batch; + block_offset = batch_id * blocks_per_batch; } else { - valid_count = items_per_block; - keys_load_type().load(keys_input + block_offset, keys, storage.keys_load); - ::rocprim::syncthreads(); - values_load_type().load(values_input + block_offset, values, storage.values_load); + blocks_per_batch = blocks_per_full_batch - 1; + block_offset = batch_id * blocks_per_batch + full_batches; } + block_offset *= items_per_block; - if(bi > 0 && flat_id == 0 && storage.has_carry_in) - { - // Apply carry-out of the previous block as carry-in for the first segment - values[0] = reduce_op(storage.carry_in.get(), values[0]); - } + const unsigned int batch_start = unique_starts[batch_id]; + unsigned int block_start = batch_start; - bool head_flags[ItemsPerThread]; - bool tail_flags[ItemsPerThread]; - key_type successor_key = keys[ItemsPerThread - 1]; - ::rocprim::syncthreads(); - if(is_last_block) + if(flat_id == 0) { - discontinuity_type().flag_heads_and_tails( - head_flags, tail_flags, successor_key, keys, - guarded_key_flag_op(key_compare_op, valid_count), - storage.discontinuity - ); + storage.has_carry_in + = (block_offset > 0) + && key_compare_op(keys_input[block_offset - 1], keys_input[block_offset]); } - else + + for(unsigned int bi = 0; bi < blocks_per_batch; bi++) { - if(flat_id == BlockSize - 1) + const bool is_last_block = (block_offset + items_per_block >= size); + + key_type keys[ItemsPerThread]; + result_type values[ItemsPerThread]; + unsigned int valid_count; + if(is_last_block) { - successor_key = keys_input[block_offset + items_per_block]; + valid_count = size - block_offset; + keys_load_type().load( + keys_input + block_offset, keys, valid_count, storage.keys_load); + ::rocprim::syncthreads(); + values_load_type().load( + values_input + block_offset, values, valid_count, storage.values_load); + } + else + { + valid_count = items_per_block; + keys_load_type().load(keys_input + block_offset, keys, storage.keys_load); + ::rocprim::syncthreads(); + values_load_type().load(values_input + block_offset, values, storage.values_load); } - discontinuity_type().flag_heads_and_tails( - head_flags, tail_flags, successor_key, keys, - key_flag_op(key_compare_op), - storage.discontinuity - ); - } - // Build pairs and run non-commutative inclusive scan to calculate scan-by-key - // and indices (ranks) of each segment: - // input: - // keys | 1 1 1 2 3 3 4 4 | - // head_flags | + + + + | - // values | 2 0 1 4 2 3 1 5 | - // result: - // scan values | 2 2 3 4 2 5 1 6 | - // scan keys | 1 1 1 2 3 3 4 4 | - // ranks (key-1) | 0 0 0 1 2 2 3 3 | - scan_by_key_pair pairs[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - pairs[i].key = head_flags[i]; - pairs[i].value = values[i]; - } - scan_by_key_op, BinaryFunction> scan_op(reduce_op); - ::rocprim::syncthreads(); - scan_type().inclusive_scan(pairs, pairs, storage.scan, scan_op); + if(bi > 0 && flat_id == 0 && storage.has_carry_in) + { + // Apply carry-out of the previous block as carry-in for the first segment + values[0] = reduce_op(storage.carry_in.get(), values[0]); + } - unsigned int ranks[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - ranks[i] = pairs[i].key - 1; // The first item is always flagged as head, so indices start from 1 - values[i] = pairs[i].value; - } + bool head_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; + key_type successor_key = keys[ItemsPerThread - 1]; + ::rocprim::syncthreads(); + if(is_last_block) + { + discontinuity_type().flag_heads_and_tails( + head_flags, + tail_flags, + successor_key, + keys, + guarded_key_flag_op(key_compare_op, valid_count), + storage.discontinuity); + } + else + { + if(flat_id == BlockSize - 1) + { + successor_key = keys_input[block_offset + items_per_block]; + } + discontinuity_type().flag_heads_and_tails( + head_flags, + tail_flags, + successor_key, + keys, + key_flag_op(key_compare_op), + storage.discontinuity); + } - if(flat_id == BlockSize - 1) - { - storage.unique_count = ranks[ItemsPerThread - 1] + (tail_flags[ItemsPerThread - 1] ? 1 : 0); - } + // Build pairs and run non-commutative inclusive scan to calculate scan-by-key + // and indices (ranks) of each segment: + // input: + // keys | 1 1 1 2 3 3 4 4 | + // head_flags | + + + + | + // values | 2 0 1 4 2 3 1 5 | + // result: + // scan values | 2 2 3 4 2 5 1 6 | + // scan keys | 1 1 1 2 3 3 4 4 | + // ranks (key-1) | 0 0 0 1 2 2 3 3 | + scan_by_key_pair pairs[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + pairs[i].key = head_flags[i]; + pairs[i].value = values[i]; + } + scan_by_key_op, BinaryFunction> scan_op(reduce_op); + ::rocprim::syncthreads(); + scan_type().inclusive_scan(pairs, pairs, storage.scan, scan_op); - ::rocprim::syncthreads(); - const unsigned int unique_count = storage.unique_count; - if(flat_id == 0) - { - // The first item must be written only if it is the first item of the current segment - // (otherwise it is written by one of previous blocks) - head_flags[0] = !storage.has_carry_in; - } - if(is_last_block) - { - // Unflag the head after the last segment as it will be written out of bounds + unsigned int ranks[ItemsPerThread]; for(unsigned int i = 0; i < ItemsPerThread; i++) { - if(ranks[i] >= unique_count) - { - head_flags[i] = false; - } + ranks[i] = pairs[i].key + - 1; // The first item is always flagged as head, so indices start from 1 + values[i] = pairs[i].value; } - } - ::rocprim::syncthreads(); - if(flat_id == BlockSize - 1) - { - if(bi == blocks_per_batch - 1) + if(flat_id == BlockSize - 1) { - // Save carry-out of the last block of the current batch - carry_outs[batch_id].value = values[ItemsPerThread - 1]; - carry_outs[batch_id].destination = block_start + ranks[ItemsPerThread - 1]; - carry_outs[batch_id].next_has_carry_in = !tail_flags[ItemsPerThread - 1]; + storage.unique_count + = ranks[ItemsPerThread - 1] + (tail_flags[ItemsPerThread - 1] ? 1 : 0); } - else + + ::rocprim::syncthreads(); + const unsigned int unique_count = storage.unique_count; + if(flat_id == 0) { - // Save carry-out to use it as carry-in for the next block of the current batch - storage.has_carry_in = !tail_flags[ItemsPerThread - 1]; - storage.carry_in.get() = values[ItemsPerThread - 1]; + // The first item must be written only if it is the first item of the current segment + // (otherwise it is written by one of previous blocks) + head_flags[0] = !storage.has_carry_in; } - } - if(batch_id > 0 && block_start == batch_start) - { - for(unsigned int i = 0; i < ItemsPerThread; i++) + if(is_last_block) { - // Write the scanned value of the last item of the first segment of the current batch - // (the leading possible incomplete aggregate) to calculate the final aggregate in the next kernel. - // The intermediate array is used instead of aggregates_output because - // aggregates_output may be write-only. - if(tail_flags[i] && ranks[i] == 0) + // Unflag the head after the last segment as it will be written out of bounds + for(unsigned int i = 0; i < ItemsPerThread; i++) { - leading_aggregates[batch_id - 1] = values[i]; + if(ranks[i] >= unique_count) + { + head_flags[i] = false; + } } } - } - // Save unique keys and aggregates (some aggregates contains partial values - // and will be updated later by calculating scan-by-key of carry-outs) - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - if(head_flags[i]) + ::rocprim::syncthreads(); + if(flat_id == BlockSize - 1) { - // Write the key of the first item of the segment as a unique key - unique_output[block_start + ranks[i]] = keys[i]; + if(bi == blocks_per_batch - 1) + { + // Save carry-out of the last block of the current batch + carry_outs[batch_id].value = values[ItemsPerThread - 1]; + carry_outs[batch_id].destination = block_start + ranks[ItemsPerThread - 1]; + carry_outs[batch_id].next_has_carry_in = !tail_flags[ItemsPerThread - 1]; + } + else + { + // Save carry-out to use it as carry-in for the next block of the current batch + storage.has_carry_in = !tail_flags[ItemsPerThread - 1]; + storage.carry_in.get() = values[ItemsPerThread - 1]; + } } - if(tail_flags[i]) + if(batch_id > 0 && block_start == batch_start) { - // Write the scanned value of the last item of the segment as an aggregate (reduction of the segment) - aggregates_output[block_start + ranks[i]] = values[i]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + // Write the scanned value of the last item of the first segment of the current batch + // (the leading possible incomplete aggregate) to calculate the final aggregate in the next kernel. + // The intermediate array is used instead of aggregates_output because + // aggregates_output may be write-only. + if(tail_flags[i] && ranks[i] == 0) + { + leading_aggregates[batch_id - 1] = values[i]; + } + } } - } - block_offset += items_per_block; - block_start += unique_count; - } -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class Result, - class AggregatesOutputIterator, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void scan_and_scatter_carry_outs(const carry_out * carry_outs, - const Result * leading_aggregates, - AggregatesOutputIterator aggregates_output, - BinaryFunction reduce_op, - unsigned int batches) -{ - using result_type = Result; + // Save unique keys and aggregates (some aggregates contains partial values + // and will be updated later by calculating scan-by-key of carry-outs) + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + if(head_flags[i]) + { + // Write the key of the first item of the segment as a unique key + unique_output[block_start + ranks[i]] = keys[i]; + } + if(tail_flags[i]) + { + // Write the scanned value of the last item of the segment as an aggregate (reduction of the segment) + aggregates_output[block_start + ranks[i]] = values[i]; + } + } - using discontinuity_type = ::rocprim::block_discontinuity; - using scan_type = ::rocprim::block_scan, BlockSize>; + block_offset += items_per_block; + block_start += unique_count; + } + } - ROCPRIM_SHARED_MEMORY struct + template + ROCPRIM_DEVICE inline void + scan_and_scatter_carry_outs(const carry_out* carry_outs, + const Result* leading_aggregates, + AggregatesOutputIterator aggregates_output, + BinaryFunction reduce_op, + unsigned int batches) { - typename discontinuity_type::storage_type discontinuity; - typename scan_type::storage_type scan; - } storage; + using result_type = Result; + + using discontinuity_type = ::rocprim::block_discontinuity; + using scan_type = ::rocprim::block_scan, BlockSize>; + + ROCPRIM_SHARED_MEMORY struct + { + typename discontinuity_type::storage_type discontinuity; + typename scan_type::storage_type scan; + } storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - carry_out cs[ItemsPerThread]; - block_load_direct_blocked(flat_id, carry_outs, cs, batches - 1); + carry_out cs[ItemsPerThread]; + block_load_direct_blocked(flat_id, carry_outs, cs, batches - 1); - unsigned int destinations[ItemsPerThread]; - result_type values[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - destinations[i] = cs[i].destination; - values[i] = cs[i].value; - } + unsigned int destinations[ItemsPerThread]; + result_type values[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + destinations[i] = cs[i].destination; + values[i] = cs[i].value; + } - bool head_flags[ItemsPerThread]; - bool tail_flags[ItemsPerThread]; - ::rocprim::equal_to compare_op; - // If a carry-out of the current batch has the same destination as previous batches, - // then we need to scan its value with values of those previous batches. - discontinuity_type().flag_heads_and_tails( - head_flags, tail_flags, - destinations[ItemsPerThread - 1], // Do not always flag the last item in the block - destinations, - guarded_key_flag_op(compare_op, batches - 1), - storage.discontinuity - ); - - scan_by_key_pair pairs[ItemsPerThread]; - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - pairs[i].key = head_flags[i]; - pairs[i].value = values[i]; - } + bool head_flags[ItemsPerThread]; + bool tail_flags[ItemsPerThread]; + ::rocprim::equal_to compare_op; + // If a carry-out of the current batch has the same destination as previous batches, + // then we need to scan its value with values of those previous batches. + discontinuity_type().flag_heads_and_tails( + head_flags, + tail_flags, + destinations[ItemsPerThread - 1], // Do not always flag the last item in the block + destinations, + guarded_key_flag_op(compare_op, batches - 1), + storage.discontinuity); - scan_by_key_op, BinaryFunction> scan_op(reduce_op); - scan_type().inclusive_scan(pairs, pairs, storage.scan, scan_op); + scan_by_key_pair pairs[ItemsPerThread]; + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + pairs[i].key = head_flags[i]; + pairs[i].value = values[i]; + } - // Scatter the last carry-out of each segment as carry-ins - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - if(tail_flags[i]) + scan_by_key_op, BinaryFunction> scan_op(reduce_op); + scan_type().inclusive_scan(pairs, pairs, storage.scan, scan_op); + + // Scatter the last carry-out of each segment as carry-ins + for(unsigned int i = 0; i < ItemsPerThread; i++) { - const unsigned int dst = destinations[i]; - const result_type aggregate = pairs[i].value; - if(cs[i].next_has_carry_in) - { - // The next batch continues the last segment from the current batch, - // combine two partial aggregates - aggregates_output[dst] = reduce_op(aggregate, leading_aggregates[flat_id * ItemsPerThread + i]); - } - else + if(tail_flags[i]) { - // Overwrite the aggregate because the next batch starts with a different key - aggregates_output[dst] = aggregate; + const unsigned int dst = destinations[i]; + const result_type aggregate = pairs[i].value; + if(cs[i].next_has_carry_in) + { + // The next batch continues the last segment from the current batch, + // combine two partial aggregates + aggregates_output[dst] + = reduce_op(aggregate, leading_aggregates[flat_id * ItemsPerThread + i]); + } + else + { + // Overwrite the aggregate because the next batch starts with a different key + aggregates_output[dst] = aggregate; + } } } } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_scan_lookback.hpp b/rocprim/include/rocprim/device/detail/device_scan_lookback.hpp index 5b6df0685..26944a619 100644 --- a/rocprim/include/rocprim/device/detail/device_scan_lookback.hpp +++ b/rocprim/include/rocprim/device/detail/device_scan_lookback.hpp @@ -21,17 +21,17 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_SCAN_LOOKBACK_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_SCAN_LOOKBACK_HPP_ -#include #include +#include #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_load.hpp" -#include "../../block/block_store.hpp" #include "../../block/block_scan.hpp" +#include "../../block/block_store.hpp" #include "lookback_scan_state.hpp" #include "ordered_block_id.hpp" @@ -45,273 +45,213 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -ROCPRIM_DEVICE inline -void init_lookback_scan_state_kernel_impl(LookBackScanState lookback_scan_state, - const unsigned int number_of_blocks, - ordered_block_id ordered_bid) -{ - const unsigned int block_id = ::rocprim::detail::block_id<0>(); - const unsigned int block_size = ::rocprim::detail::block_size<0>(); - const unsigned int block_thread_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int id = (block_id * block_size) + block_thread_id; - - // Reset ordered_block_id - if(id == 0) + template + ROCPRIM_DEVICE inline void + init_lookback_scan_state_kernel_impl(LookBackScanState lookback_scan_state, + const unsigned int number_of_blocks, + ordered_block_id ordered_bid) { - ordered_bid.reset(); + const unsigned int block_id = ::rocprim::detail::block_id<0>(); + const unsigned int block_size = ::rocprim::detail::block_size<0>(); + const unsigned int block_thread_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int id = (block_id * block_size) + block_thread_id; + + // Reset ordered_block_id + if(id == 0) + { + ordered_bid.reset(); + } + // Initialize lookback scan status + lookback_scan_state.initialize_prefix(id, number_of_blocks); } - // Initialize lookback scan status - lookback_scan_state.initialize_prefix(id, number_of_blocks); -} -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto lookback_block_scan(T (&values)[ItemsPerThread], - T /* initial_value */, - T& reduction, - typename BlockScan::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - BlockScan() - .inclusive_scan( - values, // input - values, // output - reduction, - storage, - scan_op - ); -} + template + ROCPRIM_DEVICE inline auto lookback_block_scan(T (&values)[ItemsPerThread], + T /* initial_value */, + T& reduction, + typename BlockScan::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type + { + BlockScan().inclusive_scan(values, // input + values, // output + reduction, + storage, + scan_op); + } -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto lookback_block_scan(T (&values)[ItemsPerThread], - T initial_value, - T& reduction, - typename BlockScan::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - BlockScan() - .exclusive_scan( - values, // input - values, // output - initial_value, - reduction, - storage, - scan_op - ); - reduction = scan_op(initial_value, reduction); -} + template + ROCPRIM_DEVICE inline auto lookback_block_scan(T (&values)[ItemsPerThread], + T initial_value, + T& reduction, + typename BlockScan::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type + { + BlockScan().exclusive_scan(values, // input + values, // output + initial_value, + reduction, + storage, + scan_op); + reduction = scan_op(initial_value, reduction); + } -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class PrefixCallback, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto lookback_block_scan(T (&values)[ItemsPerThread], - typename BlockScan::storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - BlockScan() - .inclusive_scan( - values, // input - values, // output - storage, - prefix_callback_op, - scan_op - ); -} + template + ROCPRIM_DEVICE inline auto lookback_block_scan(T (&values)[ItemsPerThread], + typename BlockScan::storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) -> + typename std::enable_if::type + { + BlockScan().inclusive_scan(values, // input + values, // output + storage, + prefix_callback_op, + scan_op); + } -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class PrefixCallback, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto lookback_block_scan(T (&values)[ItemsPerThread], - typename BlockScan::storage_type& storage, - PrefixCallback& prefix_callback_op, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - BlockScan() - .exclusive_scan( - values, // input - values, // output - storage, - prefix_callback_op, - scan_op - ); -} + template + ROCPRIM_DEVICE inline auto lookback_block_scan(T (&values)[ItemsPerThread], + typename BlockScan::storage_type& storage, + PrefixCallback& prefix_callback_op, + BinaryFunction scan_op) -> + typename std::enable_if::type + { + BlockScan().exclusive_scan(values, // input + values, // output + storage, + prefix_callback_op, + scan_op); + } -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class BinaryFunction, - class ResultType, - class LookbackScanState -> -ROCPRIM_DEVICE inline -void lookback_scan_kernel_impl(InputIterator input, - OutputIterator output, - const size_t size, - const ResultType initial_value, - BinaryFunction scan_op, - LookbackScanState scan_state, - const unsigned int number_of_blocks, - ordered_block_id ordered_bid) -{ - using result_type = ResultType; - static_assert( - std::is_same::value, - "value_type of LookbackScanState must be result_type" - ); + template + ROCPRIM_DEVICE inline void lookback_scan_kernel_impl(InputIterator input, + OutputIterator output, + const size_t size, + const ResultType initial_value, + BinaryFunction scan_op, + LookbackScanState scan_state, + const unsigned int number_of_blocks, + ordered_block_id ordered_bid) + { + using result_type = ResultType; + static_assert(std::is_same::value, + "value_type of LookbackScanState must be result_type"); - constexpr auto block_size = Config::block_size; - constexpr auto items_per_thread = Config::items_per_thread; - constexpr unsigned int items_per_block = block_size * items_per_thread; + constexpr auto block_size = Config::block_size; + constexpr auto items_per_thread = Config::items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; - using block_load_type = ::rocprim::block_load< - result_type, block_size, items_per_thread, - Config::block_load_method - >; - using block_store_type = ::rocprim::block_store< - result_type, block_size, items_per_thread, - Config::block_store_method - >; - using block_scan_type = ::rocprim::block_scan< - result_type, block_size, - Config::block_scan_method - >; + using block_load_type = ::rocprim:: + block_load; + using block_store_type = ::rocprim:: + block_store; + using block_scan_type + = ::rocprim::block_scan; - using order_bid_type = ordered_block_id; - using lookback_scan_prefix_op_type = lookback_scan_prefix_op< - result_type, BinaryFunction, LookbackScanState - >; + using order_bid_type = ordered_block_id; + using lookback_scan_prefix_op_type + = lookback_scan_prefix_op; - ROCPRIM_SHARED_MEMORY struct - { - typename order_bid_type::storage_type ordered_bid; - union + ROCPRIM_SHARED_MEMORY struct { - typename block_load_type::storage_type load; - typename block_store_type::storage_type store; - typename block_scan_type::storage_type scan; - }; - } storage; + typename order_bid_type::storage_type ordered_bid; + union + { + typename block_load_type::storage_type load; + typename block_store_type::storage_type store; + typename block_scan_type::storage_type scan; + }; + } storage; - const auto flat_block_thread_id = ::rocprim::flat_block_thread_id(); - const auto flat_block_id = ordered_bid.get(flat_block_thread_id, storage.ordered_bid); - const unsigned int block_offset = flat_block_id * items_per_block; - const auto valid_in_last_block = size - items_per_block * (number_of_blocks - 1); + const auto flat_block_thread_id = ::rocprim::flat_block_thread_id(); + const auto flat_block_id = ordered_bid.get(flat_block_thread_id, storage.ordered_bid); + const unsigned int block_offset = flat_block_id * items_per_block; + const auto valid_in_last_block = size - items_per_block * (number_of_blocks - 1); - // For input values - result_type values[items_per_thread]; + // For input values + result_type values[items_per_thread]; - // load input values into values - if(flat_block_id == (number_of_blocks - 1)) // last block - { - block_load_type() - .load( - input + block_offset, - values, - valid_in_last_block, - *(input + block_offset), - storage.load - ); - } - else - { - block_load_type() - .load( - input + block_offset, - values, - storage.load - ); - } - ::rocprim::syncthreads(); // sync threads to reuse shared memory + // load input values into values + if(flat_block_id == (number_of_blocks - 1)) // last block + { + block_load_type().load(input + block_offset, + values, + valid_in_last_block, + *(input + block_offset), + storage.load); + } + else + { + block_load_type().load(input + block_offset, values, storage.load); + } + ::rocprim::syncthreads(); // sync threads to reuse shared memory - if(flat_block_id == 0) - { - result_type reduction; - lookback_block_scan( - values, // input/output - initial_value, - reduction, - storage.scan, - scan_op - ); - if(flat_block_thread_id == 0) + if(flat_block_id == 0) { - scan_state.set_complete(flat_block_id, reduction); + result_type reduction; + lookback_block_scan(values, // input/output + initial_value, + reduction, + storage.scan, + scan_op); + if(flat_block_thread_id == 0) + { + scan_state.set_complete(flat_block_id, reduction); + } } - } - // Workaround: Fiji (gfx803) crashes with "Memory access fault by GPU node" on HCC 1.3.18482 (ROCm 2.0) - // Instead of just `} else {` we use `} syncthreads(); if() {`, because the else-branch can be executed - // for some unknown reason and 0-th block reads incorrect addresses in lookback_scan_prefix_op::get_prefix. - ::rocprim::syncthreads(); - if(flat_block_id > 0) - // original code: else - { - // Scan of block values - auto prefix_op = lookback_scan_prefix_op_type( - flat_block_id, scan_op, scan_state - ); - lookback_block_scan( - values, // input/output - storage.scan, - prefix_op, - scan_op - ); - } - ::rocprim::syncthreads(); // sync threads to reuse shared memory + // Workaround: Fiji (gfx803) crashes with "Memory access fault by GPU node" on HCC 1.3.18482 (ROCm 2.0) + // Instead of just `} else {` we use `} syncthreads(); if() {`, because the else-branch can be executed + // for some unknown reason and 0-th block reads incorrect addresses in lookback_scan_prefix_op::get_prefix. + ::rocprim::syncthreads(); + if(flat_block_id > 0) + // original code: else + { + // Scan of block values + auto prefix_op = lookback_scan_prefix_op_type(flat_block_id, scan_op, scan_state); + lookback_block_scan(values, // input/output + storage.scan, + prefix_op, + scan_op); + } + ::rocprim::syncthreads(); // sync threads to reuse shared memory - // Save values into output array - if(flat_block_id == (number_of_blocks - 1)) // last block - { - block_store_type() - .store( - output + block_offset, - values, - valid_in_last_block, - storage.store - ); - } - else - { - block_store_type() - .store( - output + block_offset, - values, - storage.store - ); + // Save values into output array + if(flat_block_id == (number_of_blocks - 1)) // last block + { + block_store_type().store( + output + block_offset, values, valid_in_last_block, storage.store); + } + else + { + block_store_type().store(output + block_offset, values, storage.store); + } } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_scan_reduce_then_scan.hpp b/rocprim/include/rocprim/device/detail/device_scan_reduce_then_scan.hpp index 775616b24..0a86ff216 100644 --- a/rocprim/include/rocprim/device/detail/device_scan_reduce_then_scan.hpp +++ b/rocprim/include/rocprim/device/detail/device_scan_reduce_then_scan.hpp @@ -21,419 +21,317 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_SCAN_REDUCE_THEN_SCAN_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_SCAN_REDUCE_THEN_SCAN_HPP_ -#include #include +#include #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_load.hpp" -#include "../../block/block_store.hpp" -#include "../../block/block_scan.hpp" #include "../../block/block_reduce.hpp" - +#include "../../block/block_scan.hpp" +#include "../../block/block_store.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Helper functions for performing exclusive or inclusive -// block scan in single_scan. -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto single_scan_block_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T initial_value, - typename BlockScan::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - BlockScan() - .exclusive_scan( - input, // input - output, // output - initial_value, - storage, - scan_op - ); -} - -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto single_scan_block_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T initial_value, - typename BlockScan::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - (void) initial_value; - BlockScan() - .inclusive_scan( - input, // input - output, // output - storage, - scan_op - ); -} - -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class BinaryFunction, - class ResultType -> -ROCPRIM_DEVICE inline -void single_scan_kernel_impl(InputIterator input, - const size_t input_size, - ResultType initial_value, - OutputIterator output, - BinaryFunction scan_op) -{ - constexpr unsigned int block_size = Config::block_size; - constexpr unsigned int items_per_thread = Config::items_per_thread; - - using result_type = ResultType; - - using block_load_type = ::rocprim::block_load< - result_type, block_size, items_per_thread, - Config::block_load_method - >; - using block_store_type = ::rocprim::block_store< - result_type, block_size, items_per_thread, - Config::block_store_method - >; - using block_scan_type = ::rocprim::block_scan< - result_type, block_size, - Config::block_scan_method - >; - - ROCPRIM_SHARED_MEMORY union - { - typename block_load_type::storage_type load; - typename block_store_type::storage_type store; - typename block_scan_type::storage_type scan; - } storage; - - result_type values[items_per_thread]; - // load input values into values - block_load_type() - .load( - input, - values, - input_size, - *(input), - storage.load - ); - ::rocprim::syncthreads(); // sync threads to reuse shared memory - - single_scan_block_scan( - values, // input - values, // output - initial_value, - storage.scan, - scan_op - ); - ::rocprim::syncthreads(); // sync threads to reuse shared memory - - // Save values into output array - block_store_type() - .store( - output, - values, - input_size, - storage.store - ); -} - -// Calculates block prefixes that will be used in final_scan -// when performing block scan operations. -template< - class Config, - class InputIterator, - class BinaryFunction, - class ResultType -> -ROCPRIM_DEVICE inline -void block_reduce_kernel_impl(InputIterator input, - BinaryFunction scan_op, - ResultType * block_prefixes) -{ - constexpr unsigned int block_size = Config::block_size; - constexpr unsigned int items_per_thread = Config::items_per_thread; - - using result_type = ResultType; - using block_reduce_type = ::rocprim::block_reduce< - result_type, block_size, - ::rocprim::block_reduce_algorithm::using_warp_reduce - >; - using block_load_type = ::rocprim::block_load< - result_type, block_size, items_per_thread, - Config::block_load_method - >; - - ROCPRIM_SHARED_MEMORY union - { - typename block_load_type::storage_type load; - typename block_reduce_type::storage_type reduce; - } storage; - - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - const unsigned int block_offset = flat_block_id * items_per_thread * block_size; - - // For input values - result_type values[items_per_thread]; - result_type block_prefix; - - block_load_type() - .load( - input + block_offset, - values, - storage.load - ); - ::rocprim::syncthreads(); // sync threads to reuse shared memory - - block_reduce_type() - .reduce( - values, // input - block_prefix, // output - storage.reduce, - scan_op - ); - - // Save block prefix - if(flat_id == 0) - { - block_prefixes[flat_block_id] = block_prefix; - } -} - -// Helper functions for performing exclusive or inclusive -// block scan operation in final_scan -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class ResultType, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto final_scan_block_scan(const unsigned int flat_block_id, - T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T initial_value, - ResultType * block_prefixes, - typename BlockScan::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - if(flat_block_id != 0) + // Helper functions for performing exclusive or inclusive + // block scan in single_scan. + template + ROCPRIM_DEVICE inline auto single_scan_block_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T initial_value, + typename BlockScan::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type { - // Include initial value in block prefix - initial_value = scan_op( - initial_value, block_prefixes[flat_block_id - 1] - ); + BlockScan().exclusive_scan(input, // input + output, // output + initial_value, + storage, + scan_op); } - BlockScan() - .exclusive_scan( - input, // input - output, // output - initial_value, - storage, - scan_op - ); -} - -template< - bool Exclusive, - class BlockScan, - class T, - unsigned int ItemsPerThread, - class ResultType, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto final_scan_block_scan(const unsigned int flat_block_id, - T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T initial_value, - ResultType * block_prefixes, - typename BlockScan::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - (void) initial_value; - if(flat_block_id == 0) + + template + ROCPRIM_DEVICE inline auto single_scan_block_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T initial_value, + typename BlockScan::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type { - BlockScan() - .inclusive_scan( - input, // input - output, // output - storage, - scan_op - ); + (void)initial_value; + BlockScan().inclusive_scan(input, // input + output, // output + storage, + scan_op); } - else + + template + ROCPRIM_DEVICE inline void single_scan_kernel_impl(InputIterator input, + const size_t input_size, + ResultType initial_value, + OutputIterator output, + BinaryFunction scan_op) { - auto block_prefix_op = - [&block_prefixes, &flat_block_id](const T& /*not used*/) - { - return block_prefixes[flat_block_id - 1]; - }; - BlockScan() - .inclusive_scan( - input, // input - output, // output - storage, - block_prefix_op, - scan_op - ); + constexpr unsigned int block_size = Config::block_size; + constexpr unsigned int items_per_thread = Config::items_per_thread; + + using result_type = ResultType; + + using block_load_type = ::rocprim:: + block_load; + using block_store_type = ::rocprim:: + block_store; + using block_scan_type + = ::rocprim::block_scan; + + ROCPRIM_SHARED_MEMORY union + { + typename block_load_type::storage_type load; + typename block_store_type::storage_type store; + typename block_scan_type::storage_type scan; + } storage; + + result_type values[items_per_thread]; + // load input values into values + block_load_type().load(input, values, input_size, *(input), storage.load); + ::rocprim::syncthreads(); // sync threads to reuse shared memory + + single_scan_block_scan(values, // input + values, // output + initial_value, + storage.scan, + scan_op); + ::rocprim::syncthreads(); // sync threads to reuse shared memory + + // Save values into output array + block_store_type().store(output, values, input_size, storage.store); } -} - -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class BinaryFunction, - class ResultType -> -ROCPRIM_DEVICE inline -void final_scan_kernel_impl(InputIterator input, - const size_t input_size, - OutputIterator output, - const ResultType initial_value, - BinaryFunction scan_op, - ResultType * block_prefixes) -{ - constexpr unsigned int block_size = Config::block_size; - constexpr unsigned int items_per_thread = Config::items_per_thread; - - using result_type = ResultType; - - using block_load_type = ::rocprim::block_load< - result_type, block_size, items_per_thread, - Config::block_load_method - >; - using block_store_type = ::rocprim::block_store< - result_type, block_size, items_per_thread, - Config::block_store_method - >; - using block_scan_type = ::rocprim::block_scan< - result_type, block_size, - Config::block_scan_method - >; - - ROCPRIM_SHARED_MEMORY union - { - typename block_load_type::storage_type load; - typename block_store_type::storage_type store; - typename block_scan_type::storage_type scan; - } storage; - - // It's assumed kernel is executed in 1D - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - - constexpr unsigned int items_per_block = block_size * items_per_thread; - const unsigned int block_offset = flat_block_id * items_per_block; - // TODO: number_of_blocks can be calculated on host - const unsigned int number_of_blocks = (input_size + items_per_block - 1)/items_per_block; - - // For input values - result_type values[items_per_thread]; - - // TODO: valid_in_last_block can be calculated on host - auto valid_in_last_block = input_size - items_per_block * (number_of_blocks - 1); - // load input values into values - if(flat_block_id == (number_of_blocks - 1)) // last block + + // Calculates block prefixes that will be used in final_scan + // when performing block scan operations. + template + ROCPRIM_DEVICE inline void block_reduce_kernel_impl(InputIterator input, + BinaryFunction scan_op, + ResultType* block_prefixes) { - block_load_type() - .load( - input + block_offset, - values, - valid_in_last_block, - *(input + block_offset), - storage.load - ); + constexpr unsigned int block_size = Config::block_size; + constexpr unsigned int items_per_thread = Config::items_per_thread; + + using result_type = ResultType; + using block_reduce_type + = ::rocprim::block_reduce; + using block_load_type = ::rocprim:: + block_load; + + ROCPRIM_SHARED_MEMORY union + { + typename block_load_type::storage_type load; + typename block_reduce_type::storage_type reduce; + } storage; + + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + const unsigned int block_offset = flat_block_id * items_per_thread * block_size; + + // For input values + result_type values[items_per_thread]; + result_type block_prefix; + + block_load_type().load(input + block_offset, values, storage.load); + ::rocprim::syncthreads(); // sync threads to reuse shared memory + + block_reduce_type().reduce(values, // input + block_prefix, // output + storage.reduce, + scan_op); + + // Save block prefix + if(flat_id == 0) + { + block_prefixes[flat_block_id] = block_prefix; + } } - else + + // Helper functions for performing exclusive or inclusive + // block scan operation in final_scan + template + ROCPRIM_DEVICE inline auto final_scan_block_scan(const unsigned int flat_block_id, + T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T initial_value, + ResultType* block_prefixes, + typename BlockScan::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type { - block_load_type() - .load( - input + block_offset, - values, - storage.load - ); + if(flat_block_id != 0) + { + // Include initial value in block prefix + initial_value = scan_op(initial_value, block_prefixes[flat_block_id - 1]); + } + BlockScan().exclusive_scan(input, // input + output, // output + initial_value, + storage, + scan_op); } - ::rocprim::syncthreads(); // sync threads to reuse shared memory - - final_scan_block_scan( - flat_block_id, - values, // input - values, // output - initial_value, - block_prefixes, - storage.scan, - scan_op - ); - ::rocprim::syncthreads(); // sync threads to reuse shared memory - - // Save values into output array - if(flat_block_id == (number_of_blocks - 1)) // last block + + template + ROCPRIM_DEVICE inline auto final_scan_block_scan(const unsigned int flat_block_id, + T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T initial_value, + ResultType* block_prefixes, + typename BlockScan::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type { - block_store_type() - .store( - output + block_offset, - values, - valid_in_last_block, - storage.store - ); + (void)initial_value; + if(flat_block_id == 0) + { + BlockScan().inclusive_scan(input, // input + output, // output + storage, + scan_op); + } + else + { + auto block_prefix_op = [&block_prefixes, &flat_block_id](const T& /*not used*/) { + return block_prefixes[flat_block_id - 1]; + }; + BlockScan().inclusive_scan(input, // input + output, // output + storage, + block_prefix_op, + scan_op); + } } - else + + template + ROCPRIM_DEVICE inline void final_scan_kernel_impl(InputIterator input, + const size_t input_size, + OutputIterator output, + const ResultType initial_value, + BinaryFunction scan_op, + ResultType* block_prefixes) { - block_store_type() - .store( - output + block_offset, - values, - storage.store - ); + constexpr unsigned int block_size = Config::block_size; + constexpr unsigned int items_per_thread = Config::items_per_thread; + + using result_type = ResultType; + + using block_load_type = ::rocprim:: + block_load; + using block_store_type = ::rocprim:: + block_store; + using block_scan_type + = ::rocprim::block_scan; + + ROCPRIM_SHARED_MEMORY union + { + typename block_load_type::storage_type load; + typename block_store_type::storage_type store; + typename block_scan_type::storage_type scan; + } storage; + + // It's assumed kernel is executed in 1D + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + + constexpr unsigned int items_per_block = block_size * items_per_thread; + const unsigned int block_offset = flat_block_id * items_per_block; + // TODO: number_of_blocks can be calculated on host + const unsigned int number_of_blocks = (input_size + items_per_block - 1) / items_per_block; + + // For input values + result_type values[items_per_thread]; + + // TODO: valid_in_last_block can be calculated on host + auto valid_in_last_block = input_size - items_per_block * (number_of_blocks - 1); + // load input values into values + if(flat_block_id == (number_of_blocks - 1)) // last block + { + block_load_type().load(input + block_offset, + values, + valid_in_last_block, + *(input + block_offset), + storage.load); + } + else + { + block_load_type().load(input + block_offset, values, storage.load); + } + ::rocprim::syncthreads(); // sync threads to reuse shared memory + + final_scan_block_scan(flat_block_id, + values, // input + values, // output + initial_value, + block_prefixes, + storage.scan, + scan_op); + ::rocprim::syncthreads(); // sync threads to reuse shared memory + + // Save values into output array + if(flat_block_id == (number_of_blocks - 1)) // last block + { + block_store_type().store( + output + block_offset, values, valid_in_last_block, storage.store); + } + else + { + block_store_type().store(output + block_offset, values, storage.store); + } } -} -// Returns size of temporary storage in bytes. -template -size_t scan_get_temporary_storage_bytes(size_t input_size, - size_t items_per_block) -{ - if(input_size <= items_per_block) + // Returns size of temporary storage in bytes. + template + size_t scan_get_temporary_storage_bytes(size_t input_size, size_t items_per_block) { - return 0; + if(input_size <= items_per_block) + { + return 0; + } + auto size = (input_size + items_per_block - 1) / (items_per_block); + return size * sizeof(T) + scan_get_temporary_storage_bytes(size, items_per_block); } - auto size = (input_size + items_per_block - 1)/(items_per_block); - return size * sizeof(T) + scan_get_temporary_storage_bytes(size, items_per_block); -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp index f00740a8a..3095fb7e4 100644 --- a/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp @@ -21,14 +21,14 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_SEGMENTED_RADIX_SORT_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_SEGMENTED_RADIX_SORT_HPP_ -#include #include +#include #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_scan.hpp" @@ -40,539 +40,577 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class Key, - class Value, - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits, - bool Descending -> -class segmented_radix_sort_helper -{ - static constexpr unsigned int radix_size = 1 << RadixBits; - - using key_type = Key; - using value_type = Value; - - using count_helper_type = radix_digit_count_helper; - using scan_type = typename ::rocprim::block_scan; - using sort_and_scatter_helper = radix_sort_and_scatter_helper< - BlockSize, ItemsPerThread, RadixBits, Descending, - key_type, value_type>; - -public: - - union storage_type - { - typename count_helper_type::storage_type count_helper; - typename sort_and_scatter_helper::storage_type sort_and_scatter_helper; - }; - - template< - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator - > - ROCPRIM_DEVICE inline - void sort(KeysInputIterator keys_input, - key_type * keys_tmp, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - value_type * values_tmp, - ValuesOutputIterator values_output, - bool to_output, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int bit, - unsigned int begin_bit, - unsigned int end_bit, - storage_type& storage) + template + class segmented_radix_sort_helper { - // Handle cases when (end_bit - bit) is not divisible by radix_bits, i.e. the last - // iteration has a shorter mask. - const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit); + static constexpr unsigned int radix_size = 1 << RadixBits; + + using key_type = Key; + using value_type = Value; + + using count_helper_type + = radix_digit_count_helper; + using scan_type = typename ::rocprim::block_scan; + using sort_and_scatter_helper = radix_sort_and_scatter_helper; + + public: + union storage_type + { + typename count_helper_type::storage_type count_helper; + typename sort_and_scatter_helper::storage_type sort_and_scatter_helper; + }; + + template + ROCPRIM_DEVICE inline void sort(KeysInputIterator keys_input, + key_type* keys_tmp, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + value_type* values_tmp, + ValuesOutputIterator values_output, + bool to_output, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int bit, + unsigned int begin_bit, + unsigned int end_bit, + storage_type& storage) + { + // Handle cases when (end_bit - bit) is not divisible by radix_bits, i.e. the last + // iteration has a shorter mask. + const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit); - const bool is_first_iteration = (bit == begin_bit); + const bool is_first_iteration = (bit == begin_bit); - if(is_first_iteration) - { - if(to_output) + if(is_first_iteration) { - sort( - keys_input, keys_output, values_input, values_output, - begin_offset, end_offset, - bit, current_radix_bits, - storage - ); + if(to_output) + { + sort(keys_input, + keys_output, + values_input, + values_output, + begin_offset, + end_offset, + bit, + current_radix_bits, + storage); + } + else + { + sort(keys_input, + keys_tmp, + values_input, + values_tmp, + begin_offset, + end_offset, + bit, + current_radix_bits, + storage); + } } else { - sort( - keys_input, keys_tmp, values_input, values_tmp, - begin_offset, end_offset, - bit, current_radix_bits, - storage - ); + if(to_output) + { + sort(keys_tmp, + keys_output, + values_tmp, + values_output, + begin_offset, + end_offset, + bit, + current_radix_bits, + storage); + } + else + { + sort(keys_output, + keys_tmp, + values_output, + values_tmp, + begin_offset, + end_offset, + bit, + current_radix_bits, + storage); + } } } - else + + // When all iterators are raw pointers, this overload is used to minimize code duplication in the kernel + ROCPRIM_DEVICE inline void sort(key_type* keys_input, + key_type* keys_tmp, + key_type* keys_output, + value_type* values_input, + value_type* values_tmp, + value_type* values_output, + bool to_output, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int bit, + unsigned int begin_bit, + unsigned int end_bit, + storage_type& storage) { - if(to_output) + // Handle cases when (end_bit - bit) is not divisible by radix_bits, i.e. the last + // iteration has a shorter mask. + const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit); + + const bool is_first_iteration = (bit == begin_bit); + + key_type* current_keys_input; + key_type* current_keys_output; + value_type* current_values_input; + value_type* current_values_output; + if(is_first_iteration) { - sort( - keys_tmp, keys_output, values_tmp, values_output, - begin_offset, end_offset, - bit, current_radix_bits, - storage - ); + if(to_output) + { + current_keys_input = keys_input; + current_keys_output = keys_output; + current_values_input = values_input; + current_values_output = values_output; + } + else + { + current_keys_input = keys_input; + current_keys_output = keys_tmp; + current_values_input = values_input; + current_values_output = values_tmp; + } } else { - sort( - keys_output, keys_tmp, values_output, values_tmp, - begin_offset, end_offset, - bit, current_radix_bits, - storage - ); + if(to_output) + { + current_keys_input = keys_tmp; + current_keys_output = keys_output; + current_values_input = values_tmp; + current_values_output = values_output; + } + else + { + current_keys_input = keys_output; + current_keys_output = keys_tmp; + current_values_input = values_output; + current_values_output = values_tmp; + } } + sort(current_keys_input, + current_keys_output, + current_values_input, + current_values_output, + begin_offset, + end_offset, + bit, + current_radix_bits, + storage); } - } - // When all iterators are raw pointers, this overload is used to minimize code duplication in the kernel - ROCPRIM_DEVICE inline - void sort(key_type * keys_input, - key_type * keys_tmp, - key_type * keys_output, - value_type * values_input, - value_type * values_tmp, - value_type * values_output, - bool to_output, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int bit, - unsigned int begin_bit, - unsigned int end_bit, - storage_type& storage) - { - // Handle cases when (end_bit - bit) is not divisible by radix_bits, i.e. the last - // iteration has a shorter mask. - const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit); + private: + template + ROCPRIM_DEVICE inline void sort(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int bit, + unsigned int current_radix_bits, + storage_type& storage) + { + unsigned int digit_count; + count_helper_type().count_digits(keys_input, + begin_offset, + end_offset, + bit, + current_radix_bits, + storage.count_helper, + digit_count); + + unsigned int digit_start; + scan_type().exclusive_scan(digit_count, digit_start, 0); + digit_start += begin_offset; + + ::rocprim::syncthreads(); - const bool is_first_iteration = (bit == begin_bit); + sort_and_scatter_helper().sort_and_scatter(keys_input, + keys_output, + values_input, + values_output, + begin_offset, + end_offset, + bit, + current_radix_bits, + digit_start, + storage.sort_and_scatter_helper); - key_type * current_keys_input; - key_type * current_keys_output; - value_type * current_values_input; - value_type * current_values_output; - if(is_first_iteration) - { - if(to_output) - { - current_keys_input = keys_input; - current_keys_output = keys_output; - current_values_input = values_input; - current_values_output = values_output; - } - else - { - current_keys_input = keys_input; - current_keys_output = keys_tmp; - current_values_input = values_input; - current_values_output = values_tmp; - } + ::rocprim::syncthreads(); } - else + }; + + template + class segmented_radix_sort_single_block_helper + { + using key_type = Key; + using value_type = Value; + + using key_codec = radix_key_codec; + using bit_key_type = typename key_codec::bit_key_type; + using keys_load_type + = ::rocprim::block_load; + using values_load_type + = ::rocprim::block_load; + using sort_type + = ::rocprim::block_radix_sort; + using keys_store_type + = ::rocprim::block_store; + using values_store_type + = ::rocprim::block_store; + + static constexpr bool with_values = !std::is_same::value; + + public: + union storage_type + { + typename keys_load_type::storage_type keys_load; + typename values_load_type::storage_type values_load; + typename sort_type::storage_type sort; + typename keys_store_type::storage_type keys_store; + typename values_store_type::storage_type values_store; + }; + + template + ROCPRIM_DEVICE inline void sort(KeysInputIterator keys_input, + key_type* keys_tmp, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + value_type* values_tmp, + ValuesOutputIterator values_output, + bool to_output, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int begin_bit, + unsigned int end_bit, + storage_type& storage) { if(to_output) { - current_keys_input = keys_tmp; - current_keys_output = keys_output; - current_values_input = values_tmp; - current_values_output = values_output; + sort(keys_input, + keys_output, + values_input, + values_output, + begin_offset, + end_offset, + begin_bit, + end_bit, + storage); } else { - current_keys_input = keys_output; - current_keys_output = keys_tmp; - current_values_input = values_output; - current_values_output = values_tmp; + sort(keys_input, + keys_tmp, + values_input, + values_tmp, + begin_offset, + end_offset, + begin_bit, + end_bit, + storage); } } - sort( - current_keys_input, current_keys_output, current_values_input, current_values_output, - begin_offset, end_offset, - bit, current_radix_bits, - storage - ); - } - -private: - - template< - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator - > - ROCPRIM_DEVICE inline - void sort(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int bit, - unsigned int current_radix_bits, - storage_type& storage) - { - unsigned int digit_count; - count_helper_type().count_digits( - keys_input, - begin_offset, end_offset, - bit, current_radix_bits, - storage.count_helper, - digit_count - ); - - unsigned int digit_start; - scan_type().exclusive_scan(digit_count, digit_start, 0); - digit_start += begin_offset; - - ::rocprim::syncthreads(); - - sort_and_scatter_helper().sort_and_scatter( - keys_input, keys_output, values_input, values_output, - begin_offset, end_offset, - bit, current_radix_bits, - digit_start, - storage.sort_and_scatter_helper - ); - - ::rocprim::syncthreads(); - } -}; - -template< - class Key, - class Value, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool Descending -> -class segmented_radix_sort_single_block_helper -{ - using key_type = Key; - using value_type = Value; - - using key_codec = radix_key_codec; - using bit_key_type = typename key_codec::bit_key_type; - using keys_load_type = ::rocprim::block_load< - key_type, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using values_load_type = ::rocprim::block_load< - value_type, BlockSize, ItemsPerThread, - ::rocprim::block_load_method::block_load_transpose>; - using sort_type = ::rocprim::block_radix_sort; - using keys_store_type = ::rocprim::block_store< - key_type, BlockSize, ItemsPerThread, - ::rocprim::block_store_method::block_store_transpose>; - using values_store_type = ::rocprim::block_store< - value_type, BlockSize, ItemsPerThread, - ::rocprim::block_store_method::block_store_transpose>; - - static constexpr bool with_values = !std::is_same::value; - -public: - - union storage_type - { - typename keys_load_type::storage_type keys_load; - typename values_load_type::storage_type values_load; - typename sort_type::storage_type sort; - typename keys_store_type::storage_type keys_store; - typename values_store_type::storage_type values_store; - }; - template< - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator - > - ROCPRIM_DEVICE inline - void sort(KeysInputIterator keys_input, - key_type * keys_tmp, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - value_type * values_tmp, - ValuesOutputIterator values_output, - bool to_output, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int begin_bit, - unsigned int end_bit, - storage_type& storage) - { - if(to_output) + // When all iterators are raw pointers, this overload is used to minimize code duplication in the kernel + ROCPRIM_DEVICE inline void sort(key_type* keys_input, + key_type* keys_tmp, + key_type* keys_output, + value_type* values_input, + value_type* values_tmp, + value_type* values_output, + bool to_output, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int begin_bit, + unsigned int end_bit, + storage_type& storage) { - sort( - keys_input, keys_output, values_input, values_output, - begin_offset, end_offset, - begin_bit, end_bit, - storage - ); + sort(keys_input, + (to_output ? keys_output : keys_tmp), + values_input, + (to_output ? values_output : values_tmp), + begin_offset, + end_offset, + begin_bit, + end_bit, + storage); } - else + + template + ROCPRIM_DEVICE inline bool sort(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int begin_offset, + unsigned int end_offset, + unsigned int begin_bit, + unsigned int end_bit, + storage_type& storage) { - sort( - keys_input, keys_tmp, values_input, values_tmp, - begin_offset, end_offset, - begin_bit, end_bit, - storage - ); - } - } + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - // When all iterators are raw pointers, this overload is used to minimize code duplication in the kernel - ROCPRIM_DEVICE inline - void sort(key_type * keys_input, - key_type * keys_tmp, - key_type * keys_output, - value_type * values_input, - value_type * values_tmp, - value_type * values_output, - bool to_output, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int begin_bit, - unsigned int end_bit, - storage_type& storage) - { - sort( - keys_input, (to_output ? keys_output : keys_tmp), values_input, (to_output ? values_output : values_tmp), - begin_offset, end_offset, - begin_bit, end_bit, - storage - ); - } + using shorter_single_block_helper + = segmented_radix_sort_single_block_helper; - template< - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator - > - ROCPRIM_DEVICE inline - bool sort(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int begin_offset, - unsigned int end_offset, - unsigned int begin_bit, - unsigned int end_bit, - storage_type& storage) - { - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + // Segment is longer than supported by this function + if(end_offset - begin_offset > items_per_block) + { + return false; + } - using shorter_single_block_helper = segmented_radix_sort_single_block_helper< - key_type, value_type, - BlockSize, ItemsPerThread / 2, Descending - >; + // Recursively chech if it is possible to sort the segment using fewer items per thread + const bool processed_by_shorter = shorter_single_block_helper().sort( + keys_input, + keys_output, + values_input, + values_output, + begin_offset, + end_offset, + begin_bit, + end_bit, + reinterpret_cast(storage)); + if(processed_by_shorter) + { + return true; + } - // Segment is longer than supported by this function - if(end_offset - begin_offset > items_per_block) - { - return false; - } + key_type keys[ItemsPerThread]; + value_type values[ItemsPerThread]; + const unsigned int valid_count = end_offset - begin_offset; + // Sort will leave "invalid" (out of size) items at the end of the sorted sequence + const key_type out_of_bounds = key_codec::decode(bit_key_type(-1)); + keys_load_type().load( + keys_input + begin_offset, keys, valid_count, out_of_bounds, storage.keys_load); + if(with_values) + { + ::rocprim::syncthreads(); + values_load_type().load( + values_input + begin_offset, values, valid_count, storage.values_load); + } - // Recursively chech if it is possible to sort the segment using fewer items per thread - const bool processed_by_shorter = - shorter_single_block_helper().sort( - keys_input, keys_output, values_input, values_output, - begin_offset, end_offset, - begin_bit, end_bit, - reinterpret_cast(storage) - ); - if(processed_by_shorter) - { - return true; - } + ::rocprim::syncthreads(); + sort_block(sort_type(), keys, values, storage.sort, begin_bit, end_bit); - key_type keys[ItemsPerThread]; - value_type values[ItemsPerThread]; - const unsigned int valid_count = end_offset - begin_offset; - // Sort will leave "invalid" (out of size) items at the end of the sorted sequence - const key_type out_of_bounds = key_codec::decode(bit_key_type(-1)); - keys_load_type().load(keys_input + begin_offset, keys, valid_count, out_of_bounds, storage.keys_load); - if(with_values) - { ::rocprim::syncthreads(); - values_load_type().load(values_input + begin_offset, values, valid_count, storage.values_load); - } + keys_store_type().store( + keys_output + begin_offset, keys, valid_count, storage.keys_store); + if(with_values) + { + ::rocprim::syncthreads(); + values_store_type().store( + values_output + begin_offset, values, valid_count, storage.values_store); + } - ::rocprim::syncthreads(); - sort_block(sort_type(), keys, values, storage.sort, begin_bit, end_bit); + return true; + } + }; - ::rocprim::syncthreads(); - keys_store_type().store(keys_output + begin_offset, keys, valid_count, storage.keys_store); - if(with_values) + template + class segmented_radix_sort_single_block_helper + { + public: + struct storage_type { - ::rocprim::syncthreads(); - values_store_type().store(values_output + begin_offset, values, valid_count, storage.values_store); + }; + + template + ROCPRIM_DEVICE inline bool sort(KeysInputIterator, + KeysOutputIterator, + ValuesInputIterator, + ValuesOutputIterator, + unsigned int, + unsigned int, + unsigned int, + unsigned int, + storage_type&) + { + // It can't sort anything because ItemsPerThread is 0. + // The segment will be sorted by the calles (i.e. using ItemsPerThread = 1) + return false; } + }; - return true; - } -}; - -template< - class Key, - class Value, - unsigned int BlockSize, - bool Descending -> -class segmented_radix_sort_single_block_helper -{ -public: - - struct storage_type { }; - - template< - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator - > - ROCPRIM_DEVICE inline - bool sort(KeysInputIterator, - KeysOutputIterator, - ValuesInputIterator, - ValuesOutputIterator, - unsigned int, - unsigned int, - unsigned int, - unsigned int, - storage_type&) - { - // It can't sort anything because ItemsPerThread is 0. - // The segment will be sorted by the calles (i.e. using ItemsPerThread = 1) - return false; - } -}; - -template< - class Config, - bool Descending, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class OffsetIterator -> -ROCPRIM_DEVICE inline -void segmented_sort(KeysInputIterator keys_input, - typename std::iterator_traits::value_type * keys_tmp, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - typename std::iterator_traits::value_type * values_tmp, - ValuesOutputIterator values_output, - bool to_output, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int long_iterations, - unsigned int short_iterations, - unsigned int begin_bit, - unsigned int end_bit) -{ - constexpr unsigned int long_radix_bits = Config::long_radix_bits; - constexpr unsigned int short_radix_bits = Config::short_radix_bits; - constexpr unsigned int block_size = Config::sort::block_size; - constexpr unsigned int items_per_thread = Config::sort::items_per_thread; - constexpr unsigned int items_per_block = block_size * items_per_thread; - - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; - - using single_block_helper = segmented_radix_sort_single_block_helper< - key_type, value_type, - block_size, items_per_thread, - Descending - >; - using long_radix_helper_type = segmented_radix_sort_helper< - key_type, value_type, - block_size, items_per_thread, - long_radix_bits, Descending - >; - using short_radix_helper_type = segmented_radix_sort_helper< - key_type, value_type, - block_size, items_per_thread, - short_radix_bits, Descending - >; - - ROCPRIM_SHARED_MEMORY union + template + ROCPRIM_DEVICE inline void + segmented_sort(KeysInputIterator keys_input, + typename std::iterator_traits::value_type* keys_tmp, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + typename std::iterator_traits::value_type* values_tmp, + ValuesOutputIterator values_output, + bool to_output, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int long_iterations, + unsigned int short_iterations, + unsigned int begin_bit, + unsigned int end_bit) { - typename single_block_helper::storage_type single_block_helper; - typename long_radix_helper_type::storage_type long_radix_helper; - typename short_radix_helper_type::storage_type short_radix_helper; - } storage; + constexpr unsigned int long_radix_bits = Config::long_radix_bits; + constexpr unsigned int short_radix_bits = Config::short_radix_bits; + constexpr unsigned int block_size = Config::sort::block_size; + constexpr unsigned int items_per_thread = Config::sort::items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; + + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + + using single_block_helper = segmented_radix_sort_single_block_helper; + using long_radix_helper_type = segmented_radix_sort_helper; + using short_radix_helper_type = segmented_radix_sort_helper; + + ROCPRIM_SHARED_MEMORY union + { + typename single_block_helper::storage_type single_block_helper; + typename long_radix_helper_type::storage_type long_radix_helper; + typename short_radix_helper_type::storage_type short_radix_helper; + } storage; - const unsigned int segment_id = ::rocprim::detail::block_id<0>(); + const unsigned int segment_id = ::rocprim::detail::block_id<0>(); - const unsigned int begin_offset = begin_offsets[segment_id]; - const unsigned int end_offset = end_offsets[segment_id]; + const unsigned int begin_offset = begin_offsets[segment_id]; + const unsigned int end_offset = end_offsets[segment_id]; - // Empty segment - if(end_offset <= begin_offset) - { - return; - } + // Empty segment + if(end_offset <= begin_offset) + { + return; + } - if(end_offset - begin_offset > items_per_block) - { - // Long segment - unsigned int bit = begin_bit; - for(unsigned int i = 0; i < long_iterations; i++) + if(end_offset - begin_offset > items_per_block) { - long_radix_helper_type().sort( - keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, - to_output, - begin_offset, end_offset, - bit, begin_bit, end_bit, - storage.long_radix_helper - ); - - to_output = !to_output; - bit += long_radix_bits; + // Long segment + unsigned int bit = begin_bit; + for(unsigned int i = 0; i < long_iterations; i++) + { + long_radix_helper_type().sort(keys_input, + keys_tmp, + keys_output, + values_input, + values_tmp, + values_output, + to_output, + begin_offset, + end_offset, + bit, + begin_bit, + end_bit, + storage.long_radix_helper); + + to_output = !to_output; + bit += long_radix_bits; + } + for(unsigned int i = 0; i < short_iterations; i++) + { + short_radix_helper_type().sort(keys_input, + keys_tmp, + keys_output, + values_input, + values_tmp, + values_output, + to_output, + begin_offset, + end_offset, + bit, + begin_bit, + end_bit, + storage.short_radix_helper); + + to_output = !to_output; + bit += short_radix_bits; + } } - for(unsigned int i = 0; i < short_iterations; i++) + else { - short_radix_helper_type().sort( - keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, - to_output, - begin_offset, end_offset, - bit, begin_bit, end_bit, - storage.short_radix_helper - ); - - to_output = !to_output; - bit += short_radix_bits; + // Short segment + single_block_helper().sort(keys_input, + keys_tmp, + keys_output, + values_input, + values_tmp, + values_output, + ((long_iterations + short_iterations) % 2 == 0) != to_output, + begin_offset, + end_offset, + begin_bit, + end_bit, + storage.single_block_helper); } } - else - { - // Short segment - single_block_helper().sort( - keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, - ((long_iterations + short_iterations) % 2 == 0) != to_output, - begin_offset, end_offset, - begin_bit, end_bit, - storage.single_block_helper - ); - } -} } // end namespace detail diff --git a/rocprim/include/rocprim/device/detail/device_segmented_reduce.hpp b/rocprim/include/rocprim/device/detail/device_segmented_reduce.hpp index 996383e4f..1e1efe5c0 100644 --- a/rocprim/include/rocprim/device/detail/device_segmented_reduce.hpp +++ b/rocprim/include/rocprim/device/detail/device_segmented_reduce.hpp @@ -21,8 +21,8 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_SEGMENTED_REDUCE_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_SEGMENTED_REDUCE_HPP_ -#include #include +#include #include "../../config.hpp" #include "../../detail/various.hpp" @@ -38,126 +38,122 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class Config, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class ResultType, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void segmented_reduce(InputIterator input, - OutputIterator output, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - BinaryFunction reduce_op, - ResultType initial_value) -{ - constexpr unsigned int block_size = Config::block_size; - constexpr unsigned int items_per_thread = Config::items_per_thread; - constexpr unsigned int items_per_block = block_size * items_per_thread; + template + ROCPRIM_DEVICE inline void segmented_reduce(InputIterator input, + OutputIterator output, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + BinaryFunction reduce_op, + ResultType initial_value) + { + constexpr unsigned int block_size = Config::block_size; + constexpr unsigned int items_per_thread = Config::items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; - using reduce_type = ::rocprim::block_reduce< - ResultType, block_size, - Config::block_reduce_method - >; + using reduce_type + = ::rocprim::block_reduce; - ROCPRIM_SHARED_MEMORY typename reduce_type::storage_type reduce_storage; + ROCPRIM_SHARED_MEMORY typename reduce_type::storage_type reduce_storage; - const unsigned int flat_id = ::rocprim::flat_block_thread_id(); - const unsigned int segment_id = ::rocprim::detail::block_id<0>(); + const unsigned int flat_id = ::rocprim::flat_block_thread_id(); + const unsigned int segment_id = ::rocprim::detail::block_id<0>(); - const unsigned int begin_offset = begin_offsets[segment_id]; - const unsigned int end_offset = end_offsets[segment_id]; + const unsigned int begin_offset = begin_offsets[segment_id]; + const unsigned int end_offset = end_offsets[segment_id]; - // Empty segment - if(end_offset <= begin_offset) - { - if(flat_id == 0) + // Empty segment + if(end_offset <= begin_offset) { - output[segment_id] = initial_value; + if(flat_id == 0) + { + output[segment_id] = initial_value; + } + return; } - return; - } - - ResultType result; - unsigned int block_offset = begin_offset; - if(block_offset + items_per_block > end_offset) - { - // Segment is shorter than items_per_block - // Load the partial block and reduce the current thread's values - const unsigned int valid_count = end_offset - block_offset; - if(flat_id < valid_count) + ResultType result; + unsigned int block_offset = begin_offset; + if(block_offset + items_per_block > end_offset) { - unsigned int offset = block_offset + flat_id; - result = input[offset]; - offset += block_size; - while(offset < end_offset) + // Segment is shorter than items_per_block + + // Load the partial block and reduce the current thread's values + const unsigned int valid_count = end_offset - block_offset; + if(flat_id < valid_count) { - result = reduce_op(result, static_cast(input[offset])); + unsigned int offset = block_offset + flat_id; + result = input[offset]; offset += block_size; + while(offset < end_offset) + { + result = reduce_op(result, static_cast(input[offset])); + offset += block_size; + } } - } - // Reduce threads' reductions to compute the final result - if(valid_count >= block_size) - { - // All threads have at least one value, i.e. result has valid value - reduce_type().reduce(result, result, reduce_storage, reduce_op); + // Reduce threads' reductions to compute the final result + if(valid_count >= block_size) + { + // All threads have at least one value, i.e. result has valid value + reduce_type().reduce(result, result, reduce_storage, reduce_op); + } + else + { + reduce_type().reduce(result, result, valid_count, reduce_storage, reduce_op); + } } else { - reduce_type().reduce(result, result, valid_count, reduce_storage, reduce_op); - } - } - else - { - // Long segments - - ResultType values[items_per_thread]; + // Long segments - // Load the first block and reduce the current thread's values - block_load_direct_striped(flat_id, input + block_offset, values); - result = values[0]; - for(unsigned int i = 1; i < items_per_thread; i++) - { - result = reduce_op(result, values[i]); - } - block_offset += items_per_block; + ResultType values[items_per_thread]; - // Load next full blocks and continue reduction - while(block_offset + items_per_block < end_offset) - { + // Load the first block and reduce the current thread's values block_load_direct_striped(flat_id, input + block_offset, values); - for(unsigned int i = 0; i < items_per_thread; i++) + result = values[0]; + for(unsigned int i = 1; i < items_per_thread; i++) { result = reduce_op(result, values[i]); } block_offset += items_per_block; - } - // Load the last (probably partial) block and continue reduction - const unsigned int valid_count = end_offset - block_offset; - block_load_direct_striped(flat_id, input + block_offset, values, valid_count); - for(unsigned int i = 0; i < items_per_thread; i++) - { - if(i * block_size + flat_id < valid_count) + // Load next full blocks and continue reduction + while(block_offset + items_per_block < end_offset) { - result = reduce_op(result, values[i]); + block_load_direct_striped(flat_id, input + block_offset, values); + for(unsigned int i = 0; i < items_per_thread; i++) + { + result = reduce_op(result, values[i]); + } + block_offset += items_per_block; } - } - // Reduce threads' reductions to compute the final result - reduce_type().reduce(result, result, reduce_storage, reduce_op); - } + // Load the last (probably partial) block and continue reduction + const unsigned int valid_count = end_offset - block_offset; + block_load_direct_striped( + flat_id, input + block_offset, values, valid_count); + for(unsigned int i = 0; i < items_per_thread; i++) + { + if(i * block_size + flat_id < valid_count) + { + result = reduce_op(result, values[i]); + } + } - if(flat_id == 0) - { - output[segment_id] = reduce_op(initial_value, result); + // Reduce threads' reductions to compute the final result + reduce_type().reduce(result, result, reduce_storage, reduce_op); + } + + if(flat_id == 0) + { + output[segment_id] = reduce_op(initial_value, result); + } } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_segmented_scan.hpp b/rocprim/include/rocprim/device/detail/device_segmented_scan.hpp index 01ee9b303..473029d61 100644 --- a/rocprim/include/rocprim/device/detail/device_segmented_scan.hpp +++ b/rocprim/include/rocprim/device/detail/device_segmented_scan.hpp @@ -21,213 +21,180 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_SEGMENTED_SCAN_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_SEGMENTED_SCAN_HPP_ -#include #include +#include #include "../../config.hpp" #include "../../intrinsics.hpp" #include "../../types.hpp" -#include "../../detail/various.hpp" #include "../../detail/binary_op_wrappers.hpp" +#include "../../detail/various.hpp" #include "../../block/block_load.hpp" -#include "../../block/block_store.hpp" #include "../../block/block_scan.hpp" +#include "../../block/block_store.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - bool Exclusive, - bool UsePrefix, - class BlockScanType, - class T, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto segmented_scan_block_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& prefix, - typename BlockScanType::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - auto prefix_op = - [&prefix, scan_op](const T& reduction) - { + template + ROCPRIM_DEVICE inline auto + segmented_scan_block_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& prefix, + typename BlockScanType::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type + { + auto prefix_op = [&prefix, scan_op](const T& reduction) { auto saved_prefix = prefix; - prefix = scan_op(prefix, reduction); + prefix = scan_op(prefix, reduction); return saved_prefix; }; - BlockScanType() - .exclusive_scan( - input, output, - storage, prefix_op, scan_op - ); -} - -template< - bool Exclusive, - bool UsePrefix, - class BlockScanType, - class T, - unsigned int ItemsPerThread, - class BinaryFunction -> -ROCPRIM_DEVICE inline -auto segmented_scan_block_scan(T (&input)[ItemsPerThread], - T (&output)[ItemsPerThread], - T& prefix, - typename BlockScanType::storage_type& storage, - BinaryFunction scan_op) - -> typename std::enable_if::type -{ - if(UsePrefix) + BlockScanType().exclusive_scan(input, output, storage, prefix_op, scan_op); + } + + template + ROCPRIM_DEVICE inline auto + segmented_scan_block_scan(T (&input)[ItemsPerThread], + T (&output)[ItemsPerThread], + T& prefix, + typename BlockScanType::storage_type& storage, + BinaryFunction scan_op) -> + typename std::enable_if::type { - auto prefix_op = - [&prefix, scan_op](const T& reduction) - { + if(UsePrefix) + { + auto prefix_op = [&prefix, scan_op](const T& reduction) { auto saved_prefix = prefix; - prefix = scan_op(prefix, reduction); + prefix = scan_op(prefix, reduction); return saved_prefix; }; - BlockScanType() - .inclusive_scan( - input, output, - storage, prefix_op, scan_op - ); - return; + BlockScanType().inclusive_scan(input, output, storage, prefix_op, scan_op); + return; + } + BlockScanType().inclusive_scan(input, output, prefix, storage, scan_op); } - BlockScanType() - .inclusive_scan( - input, output, prefix, - storage, scan_op - ); -} - -template< - bool Exclusive, - class Config, - class ResultType, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class InitValueType, - class BinaryFunction -> -ROCPRIM_DEVICE inline -void segmented_scan(InputIterator input, - OutputIterator output, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - InitValueType initial_value, - BinaryFunction scan_op) -{ - constexpr auto block_size = Config::block_size; - constexpr auto items_per_thread = Config::items_per_thread; - constexpr unsigned int items_per_block = block_size * items_per_thread; - - using result_type = ResultType; - using block_load_type = ::rocprim::block_load< - result_type, block_size, items_per_thread, - Config::block_load_method - >; - using block_store_type = ::rocprim::block_store< - result_type, block_size, items_per_thread, - Config::block_store_method - >; - using block_scan_type = ::rocprim::block_scan< - result_type, block_size, - Config::block_scan_method - >; - - ROCPRIM_SHARED_MEMORY union + + template + ROCPRIM_DEVICE inline void segmented_scan(InputIterator input, + OutputIterator output, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + InitValueType initial_value, + BinaryFunction scan_op) { - typename block_load_type::storage_type load; - typename block_store_type::storage_type store; - typename block_scan_type::storage_type scan; - } storage; + constexpr auto block_size = Config::block_size; + constexpr auto items_per_thread = Config::items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; + + using result_type = ResultType; + using block_load_type = ::rocprim:: + block_load; + using block_store_type = ::rocprim:: + block_store; + using block_scan_type + = ::rocprim::block_scan; + + ROCPRIM_SHARED_MEMORY union + { + typename block_load_type::storage_type load; + typename block_store_type::storage_type store; + typename block_scan_type::storage_type scan; + } storage; - const unsigned int segment_id = ::rocprim::detail::block_id<0>(); - const unsigned int begin_offset = begin_offsets[segment_id]; - const unsigned int end_offset = end_offsets[segment_id]; + const unsigned int segment_id = ::rocprim::detail::block_id<0>(); + const unsigned int begin_offset = begin_offsets[segment_id]; + const unsigned int end_offset = end_offsets[segment_id]; - // Empty segment - if(end_offset <= begin_offset) - { - return; - } + // Empty segment + if(end_offset <= begin_offset) + { + return; + } - // Input values - result_type values[items_per_thread]; - result_type prefix = initial_value; + // Input values + result_type values[items_per_thread]; + result_type prefix = initial_value; - unsigned int block_offset = begin_offset; - if(block_offset + items_per_block > end_offset) - { - // Segment is shorter than items_per_block - - // Load the partial block - const unsigned int valid_count = end_offset - block_offset; - block_load_type().load(input + block_offset, values, valid_count, storage.load); - ::rocprim::syncthreads(); - // Perform scan operation - segmented_scan_block_scan( - values, values, prefix, storage.scan, scan_op - ); - ::rocprim::syncthreads(); - // Store the partial block - block_store_type().store(output + block_offset, values, valid_count, storage.store); - } - else - { - // Long segments - - // Load the first block of input values - block_load_type().load(input + block_offset, values, storage.load); - ::rocprim::syncthreads(); - // Perform scan operation - segmented_scan_block_scan( - values, values, prefix, storage.scan, scan_op - ); - ::rocprim::syncthreads(); - // Store - block_store_type().store(output + block_offset, values, storage.store); - ::rocprim::syncthreads(); - block_offset += items_per_block; - - // Load next full blocks and continue scanning - while(block_offset + items_per_block < end_offset) + unsigned int block_offset = begin_offset; + if(block_offset + items_per_block > end_offset) + { + // Segment is shorter than items_per_block + + // Load the partial block + const unsigned int valid_count = end_offset - block_offset; + block_load_type().load(input + block_offset, values, valid_count, storage.load); + ::rocprim::syncthreads(); + // Perform scan operation + segmented_scan_block_scan( + values, values, prefix, storage.scan, scan_op); + ::rocprim::syncthreads(); + // Store the partial block + block_store_type().store(output + block_offset, values, valid_count, storage.store); + } + else { + // Long segments + + // Load the first block of input values block_load_type().load(input + block_offset, values, storage.load); ::rocprim::syncthreads(); // Perform scan operation - segmented_scan_block_scan( - values, values, prefix, storage.scan, scan_op - ); + segmented_scan_block_scan( + values, values, prefix, storage.scan, scan_op); ::rocprim::syncthreads(); + // Store block_store_type().store(output + block_offset, values, storage.store); ::rocprim::syncthreads(); block_offset += items_per_block; - } - // Load the last (probably partial) block and continue scanning - const unsigned int valid_count = end_offset - block_offset; - block_load_type().load(input + block_offset, values, valid_count, storage.load); - ::rocprim::syncthreads(); - // Perform scan operation - segmented_scan_block_scan( - values, values, prefix, storage.scan, scan_op - ); - ::rocprim::syncthreads(); - // Store the partial block - block_store_type().store(output + block_offset, values, valid_count, storage.store); + // Load next full blocks and continue scanning + while(block_offset + items_per_block < end_offset) + { + block_load_type().load(input + block_offset, values, storage.load); + ::rocprim::syncthreads(); + // Perform scan operation + segmented_scan_block_scan( + values, values, prefix, storage.scan, scan_op); + ::rocprim::syncthreads(); + block_store_type().store(output + block_offset, values, storage.store); + ::rocprim::syncthreads(); + block_offset += items_per_block; + } + + // Load the last (probably partial) block and continue scanning + const unsigned int valid_count = end_offset - block_offset; + block_load_type().load(input + block_offset, values, valid_count, storage.load); + ::rocprim::syncthreads(); + // Perform scan operation + segmented_scan_block_scan( + values, values, prefix, storage.scan, scan_op); + ::rocprim::syncthreads(); + // Store the partial block + block_store_type().store(output + block_offset, values, valid_count, storage.store); + } } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/device_transform.hpp b/rocprim/include/rocprim/device/detail/device_transform.hpp index 3665daded..c302eec8c 100644 --- a/rocprim/include/rocprim/device/detail/device_transform.hpp +++ b/rocprim/include/rocprim/device/detail/device_transform.hpp @@ -21,15 +21,15 @@ #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_TRANSFORM_HPP_ #define ROCPRIM_DEVICE_DETAIL_DEVICE_TRANSFORM_HPP_ -#include #include +#include #include "../../config.hpp" -#include "../../detail/various.hpp" #include "../../detail/match_result_type.hpp" +#include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" #include "../../types.hpp" #include "../../block/block_load.hpp" @@ -40,112 +40,88 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Wrapper for unpacking tuple to be used with BinaryFunction. -// See transform function which accepts two input iterators. -template -struct unpack_binary_op -{ - using result_type = typename ::rocprim::detail::invoke_result::type; + // Wrapper for unpacking tuple to be used with BinaryFunction. + // See transform function which accepts two input iterators. + template + struct unpack_binary_op + { + using result_type = typename ::rocprim::detail::invoke_result::type; - ROCPRIM_HOST_DEVICE inline - unpack_binary_op() = default; + ROCPRIM_HOST_DEVICE inline unpack_binary_op() = default; - ROCPRIM_HOST_DEVICE inline - unpack_binary_op(BinaryFunction binary_op) : binary_op_(binary_op) - { - } + ROCPRIM_HOST_DEVICE inline unpack_binary_op(BinaryFunction binary_op) + : binary_op_(binary_op) + { + } - ROCPRIM_HOST_DEVICE inline - ~unpack_binary_op() = default; + ROCPRIM_HOST_DEVICE inline ~unpack_binary_op() = default; + + ROCPRIM_HOST_DEVICE inline result_type operator()(const ::rocprim::tuple& t) + { + return binary_op_(::rocprim::get<0>(t), ::rocprim::get<1>(t)); + } - ROCPRIM_HOST_DEVICE inline - result_type operator()(const ::rocprim::tuple& t) + private: + BinaryFunction binary_op_; + }; + + template + ROCPRIM_DEVICE inline void transform_kernel_impl(InputIterator input, + const size_t input_size, + OutputIterator output, + UnaryFunction transform_op) { - return binary_op_(::rocprim::get<0>(t), ::rocprim::get<1>(t)); - } + using input_type = typename std::iterator_traits::value_type; + using output_type = typename std::iterator_traits::value_type; + using result_type = typename std:: + conditional::value, ResultType, output_type>::type; -private: - BinaryFunction binary_op_; -}; - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class ResultType, - class InputIterator, - class OutputIterator, - class UnaryFunction -> -ROCPRIM_DEVICE inline -void transform_kernel_impl(InputIterator input, - const size_t input_size, - OutputIterator output, - UnaryFunction transform_op) -{ - using input_type = typename std::iterator_traits::value_type; - using output_type = typename std::iterator_traits::value_type; - using result_type = - typename std::conditional< - std::is_void::value, ResultType, output_type - >::type; + constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; + const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); + const unsigned int block_offset = flat_block_id * items_per_block; + const unsigned int number_of_blocks = ::rocprim::detail::grid_size<0>(); + const unsigned int valid_in_last_block = input_size - block_offset; - const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int flat_block_id = ::rocprim::detail::block_id<0>(); - const unsigned int block_offset = flat_block_id * items_per_block; - const unsigned int number_of_blocks = ::rocprim::detail::grid_size<0>(); - const unsigned int valid_in_last_block = input_size - block_offset; + input_type input_values[ItemsPerThread]; + result_type output_values[ItemsPerThread]; - input_type input_values[ItemsPerThread]; - result_type output_values[ItemsPerThread]; + if(flat_block_id == (number_of_blocks - 1)) // last block + { + block_load_direct_striped( + flat_id, input + block_offset, input_values, valid_in_last_block); - if(flat_block_id == (number_of_blocks - 1)) // last block - { - block_load_direct_striped( - flat_id, - input + block_offset, - input_values, - valid_in_last_block - ); - - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) + { + if(BlockSize * i + flat_id < valid_in_last_block) + { + output_values[i] = transform_op(input_values[i]); + } + } + + block_store_direct_striped( + flat_id, output + block_offset, output_values, valid_in_last_block); + } + else { - if(BlockSize * i + flat_id < valid_in_last_block) + block_load_direct_striped(flat_id, input + block_offset, input_values); + +#pragma unroll + for(unsigned int i = 0; i < ItemsPerThread; i++) { output_values[i] = transform_op(input_values[i]); } - } - block_store_direct_striped( - flat_id, - output + block_offset, - output_values, - valid_in_last_block - ); - } - else - { - block_load_direct_striped( - flat_id, - input + block_offset, - input_values - ); - - #pragma unroll - for(unsigned int i = 0; i < ItemsPerThread; i++) - { - output_values[i] = transform_op(input_values[i]); + block_store_direct_striped(flat_id, output + block_offset, output_values); } - - block_store_direct_striped( - flat_id, - output + block_offset, - output_values - ); } -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index a1c5b1174..27e6852b2 100644 --- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -24,14 +24,14 @@ #include #include "../../intrinsics.hpp" -#include "../../types.hpp" #include "../../type_traits.hpp" +#include "../../types.hpp" #include "../../warp/detail/warp_reduce_crosslane.hpp" #include "../../warp/detail/warp_scan_crosslane.hpp" -#include "../../detail/various.hpp" #include "../../detail/binary_op_wrappers.hpp" +#include "../../detail/various.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -42,337 +42,307 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -enum prefix_flag -{ - // flag for padding, values should be discarded - PREFIX_INVALID = -1, - // initialized, not result in value - PREFIX_EMPTY = 0, - // partial prefix value (from single block) - PREFIX_PARTIAL = 1, - // final prefix value - PREFIX_COMPLETE = 2 -}; - -// lookback_scan_state object keeps track of prefixes status for -// a look-back prefix scan. Initially every prefix can be either -// invalid (padding values) or empty. One thread in a block should -// later set it to partial, and later to complete. -template -struct lookback_scan_state; - -// Packed flag and prefix value are loaded/stored in one atomic operation. -template -struct lookback_scan_state -{ -private: - using flag_type_ = char; - - // Type which is used in store/load operations of block prefix (flag and value). - // It is 32-bit or 64-bit int and can be loaded/stored using single atomic instruction. - using prefix_underlying_type = - typename std::conditional< - (sizeof(T) > 2), - unsigned long long, - unsigned int - >::type; - - static constexpr unsigned int padding = ::rocprim::warp_size(); - - // Helper struct - struct prefix_type + enum prefix_flag { - flag_type_ flag; - T value; - } __attribute__((aligned(sizeof(prefix_underlying_type)))); + // flag for padding, values should be discarded + PREFIX_INVALID = -1, + // initialized, not result in value + PREFIX_EMPTY = 0, + // partial prefix value (from single block) + PREFIX_PARTIAL = 1, + // final prefix value + PREFIX_COMPLETE = 2 + }; + + // lookback_scan_state object keeps track of prefixes status for + // a look-back prefix scan. Initially every prefix can be either + // invalid (padding values) or empty. One thread in a block should + // later set it to partial, and later to complete. + template + struct lookback_scan_state; + + // Packed flag and prefix value are loaded/stored in one atomic operation. + template + struct lookback_scan_state + { + private: + using flag_type_ = char; - static_assert(sizeof(prefix_underlying_type) == sizeof(prefix_type), ""); + // Type which is used in store/load operations of block prefix (flag and value). + // It is 32-bit or 64-bit int and can be loaded/stored using single atomic instruction. + using prefix_underlying_type = + typename std::conditional<(sizeof(T) > 2), unsigned long long, unsigned int>::type; -public: - // Type used for flag/flag of block prefix - using flag_type = flag_type_; - using value_type = T; + static constexpr unsigned int padding = ::rocprim::warp_size(); - // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes - ROCPRIM_HOST static inline - lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks) - { - (void) number_of_blocks; - lookback_scan_state state; - state.prefixes = reinterpret_cast(temp_storage); - return state; - } - - ROCPRIM_HOST static inline - size_t get_storage_size(const unsigned int number_of_blocks) - { - return sizeof(prefix_underlying_type) * (padding + number_of_blocks); - } + // Helper struct + struct prefix_type + { + flag_type_ flag; + T value; + } __attribute__((aligned(sizeof(prefix_underlying_type)))); - ROCPRIM_DEVICE inline - void initialize_prefix(const unsigned int block_id, - const unsigned int number_of_blocks) - { - if(block_id < number_of_blocks) + static_assert(sizeof(prefix_underlying_type) == sizeof(prefix_type), ""); + + public: + // Type used for flag/flag of block prefix + using flag_type = flag_type_; + using value_type = T; + + // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes + ROCPRIM_HOST static inline lookback_scan_state create(void* temp_storage, + const unsigned int number_of_blocks) { - prefix_type prefix; - prefix.flag = PREFIX_EMPTY; - prefix_underlying_type p; - __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); - prefixes[padding + block_id] = p; + (void)number_of_blocks; + lookback_scan_state state; + state.prefixes = reinterpret_cast(temp_storage); + return state; } - if(block_id < padding) + + ROCPRIM_HOST static inline size_t get_storage_size(const unsigned int number_of_blocks) + { + return sizeof(prefix_underlying_type) * (padding + number_of_blocks); + } + + ROCPRIM_DEVICE inline void initialize_prefix(const unsigned int block_id, + const unsigned int number_of_blocks) + { + if(block_id < number_of_blocks) + { + prefix_type prefix; + prefix.flag = PREFIX_EMPTY; + prefix_underlying_type p; + __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); + prefixes[padding + block_id] = p; + } + if(block_id < padding) + { + prefix_type prefix; + prefix.flag = PREFIX_INVALID; + prefix_underlying_type p; + __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); + prefixes[block_id] = p; + } + } + + ROCPRIM_DEVICE inline void set_partial(const unsigned int block_id, const T value) + { + this->set(block_id, PREFIX_PARTIAL, value); + } + + ROCPRIM_DEVICE inline void set_complete(const unsigned int block_id, const T value) + { + this->set(block_id, PREFIX_COMPLETE, value); + } + + // block_id must be > 0 + ROCPRIM_DEVICE inline void get(const unsigned int block_id, flag_type& flag, T& value) { prefix_type prefix; - prefix.flag = PREFIX_INVALID; + do + { + // atomic_add(..., 0) is used to load values atomically + prefix_underlying_type p + = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0); + __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); + } while(prefix.flag == PREFIX_EMPTY); + + // return + flag = prefix.flag; + value = prefix.value; + } + + private: + ROCPRIM_DEVICE inline void + set(const unsigned int block_id, const flag_type flag, const T value) + { + prefix_type prefix = {flag, value}; prefix_underlying_type p; __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); - prefixes[block_id] = p; + ::rocprim::detail::atomic_exch(&prefixes[padding + block_id], p); } - } - ROCPRIM_DEVICE inline - void set_partial(const unsigned int block_id, const T value) - { - this->set(block_id, PREFIX_PARTIAL, value); - } + prefix_underlying_type* prefixes; + }; - ROCPRIM_DEVICE inline - void set_complete(const unsigned int block_id, const T value) + // Flag, partial and final prefixes are stored in separate arrays. + // Consistency ensured by memory fences between flag and prefixes load/store operations. + template + struct lookback_scan_state { - this->set(block_id, PREFIX_COMPLETE, value); - } + private: + static constexpr unsigned int padding = ::rocprim::warp_size(); - // block_id must be > 0 - ROCPRIM_DEVICE inline - void get(const unsigned int block_id, flag_type& flag, T& value) - { - prefix_type prefix; - do + public: + using flag_type = char; + using value_type = T; + + // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes + ROCPRIM_HOST static inline lookback_scan_state create(void* temp_storage, + const unsigned int number_of_blocks) { - // atomic_add(..., 0) is used to load values atomically - prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0); - __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); - } while(prefix.flag == PREFIX_EMPTY); - - // return - flag = prefix.flag; - value = prefix.value; - } - -private: - ROCPRIM_DEVICE inline - void set(const unsigned int block_id, const flag_type flag, const T value) - { - prefix_type prefix = { flag, value }; - prefix_underlying_type p; - __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); - ::rocprim::detail::atomic_exch(&prefixes[padding + block_id], p); - } - - prefix_underlying_type * prefixes; -}; - -// Flag, partial and final prefixes are stored in separate arrays. -// Consistency ensured by memory fences between flag and prefixes load/store operations. -template -struct lookback_scan_state -{ -private: - static constexpr unsigned int padding = ::rocprim::warp_size(); + const auto n = padding + number_of_blocks; + lookback_scan_state state; -public: - using flag_type = char; - using value_type = T; + auto ptr = reinterpret_cast(temp_storage); - // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes - ROCPRIM_HOST static inline - lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks) - { - const auto n = padding + number_of_blocks; - lookback_scan_state state; + state.prefixes_flags = reinterpret_cast(ptr); + ptr += ::rocprim::detail::align_size(n * sizeof(flag_type)); - auto ptr = reinterpret_cast(temp_storage); + state.prefixes_partial_values = reinterpret_cast(ptr); + ptr += ::rocprim::detail::align_size(n * sizeof(T)); - state.prefixes_flags = reinterpret_cast(ptr); - ptr += ::rocprim::detail::align_size(n * sizeof(flag_type)); + state.prefixes_complete_values = reinterpret_cast(ptr); + return state; + } - state.prefixes_partial_values = reinterpret_cast(ptr); - ptr += ::rocprim::detail::align_size(n * sizeof(T)); + ROCPRIM_HOST static inline size_t get_storage_size(const unsigned int number_of_blocks) + { + const auto n = padding + number_of_blocks; + size_t size = ::rocprim::detail::align_size(n * sizeof(flag_type)); + size += 2 * ::rocprim::detail::align_size(n * sizeof(T)); + return size; + } - state.prefixes_complete_values = reinterpret_cast(ptr); - return state; - } + ROCPRIM_DEVICE inline void initialize_prefix(const unsigned int block_id, + const unsigned int number_of_blocks) + { + if(block_id < number_of_blocks) + { + prefixes_flags[padding + block_id] = PREFIX_EMPTY; + } + if(block_id < padding) + { + prefixes_flags[block_id] = PREFIX_INVALID; + } + } - ROCPRIM_HOST static inline - size_t get_storage_size(const unsigned int number_of_blocks) - { - const auto n = padding + number_of_blocks; - size_t size = ::rocprim::detail::align_size(n * sizeof(flag_type)); - size += 2 * ::rocprim::detail::align_size(n * sizeof(T)); - return size; - } - - ROCPRIM_DEVICE inline - void initialize_prefix(const unsigned int block_id, - const unsigned int number_of_blocks) - { - if(block_id < number_of_blocks) + ROCPRIM_DEVICE inline void set_partial(const unsigned int block_id, const T value) { - prefixes_flags[padding + block_id] = PREFIX_EMPTY; + store_volatile(&prefixes_partial_values[padding + block_id], value); + ::rocprim::detail::memory_fence_device(); + store_volatile(&prefixes_flags[padding + block_id], PREFIX_PARTIAL); } - if(block_id < padding) + + ROCPRIM_DEVICE inline void set_complete(const unsigned int block_id, const T value) { - prefixes_flags[block_id] = PREFIX_INVALID; + store_volatile(&prefixes_complete_values[padding + block_id], value); + ::rocprim::detail::memory_fence_device(); + store_volatile(&prefixes_flags[padding + block_id], PREFIX_COMPLETE); } - } - ROCPRIM_DEVICE inline - void set_partial(const unsigned int block_id, const T value) - { - store_volatile(&prefixes_partial_values[padding + block_id], value); - ::rocprim::detail::memory_fence_device(); - store_volatile(&prefixes_flags[padding + block_id], PREFIX_PARTIAL); - } + // block_id must be > 0 + ROCPRIM_DEVICE inline void get(const unsigned int block_id, flag_type& flag, T& value) + { + do + { + flag = load_volatile(&prefixes_flags[padding + block_id]); + ::rocprim::detail::memory_fence_device(); + } while(flag == PREFIX_EMPTY); - ROCPRIM_DEVICE inline - void set_complete(const unsigned int block_id, const T value) - { - store_volatile(&prefixes_complete_values[padding + block_id], value); - ::rocprim::detail::memory_fence_device(); - store_volatile(&prefixes_flags[padding + block_id], PREFIX_COMPLETE); - } - - // block_id must be > 0 - ROCPRIM_DEVICE inline - void get(const unsigned int block_id, flag_type& flag, T& value) + if(flag == PREFIX_PARTIAL) + value = load_volatile(&prefixes_partial_values[padding + block_id]); + else + value = load_volatile(&prefixes_complete_values[padding + block_id]); + } + + private: + flag_type* prefixes_flags; + // We need to separate arrays for partial and final prefixes, because + // value can be overwritten before flag is changed (flag and value are + // not stored in single instruction). + T* prefixes_partial_values; + T* prefixes_complete_values; + }; + + template + class lookback_scan_prefix_op { - do + using flag_type = typename LookbackScanState::flag_type; + static_assert(std::is_same::value, + "T must be LookbackScanState::value_type"); + + public: + ROCPRIM_DEVICE inline lookback_scan_prefix_op(unsigned int block_id, + BinaryFunction scan_op, + LookbackScanState& scan_state) + : block_id_(block_id) + , scan_op_(scan_op) + , scan_state_(scan_state) { - flag = load_volatile(&prefixes_flags[padding + block_id]); - ::rocprim::detail::memory_fence_device(); - } while(flag == PREFIX_EMPTY); - - if(flag == PREFIX_PARTIAL) - value = load_volatile(&prefixes_partial_values[padding + block_id]); - else - value = load_volatile(&prefixes_complete_values[padding + block_id]); - } - -private: - flag_type * prefixes_flags; - // We need to separate arrays for partial and final prefixes, because - // value can be overwritten before flag is changed (flag and value are - // not stored in single instruction). - T * prefixes_partial_values; - T * prefixes_complete_values; -}; - -template -class lookback_scan_prefix_op -{ - using flag_type = typename LookbackScanState::flag_type; - static_assert( - std::is_same::value, - "T must be LookbackScanState::value_type" - ); - -public: - ROCPRIM_DEVICE inline - lookback_scan_prefix_op(unsigned int block_id, - BinaryFunction scan_op, - LookbackScanState &scan_state) - : block_id_(block_id), - scan_op_(scan_op), - scan_state_(scan_state) - { - } + } - ROCPRIM_DEVICE inline - ~lookback_scan_prefix_op() = default; + ROCPRIM_DEVICE inline ~lookback_scan_prefix_op() = default; - ROCPRIM_DEVICE inline - void reduce_partial_prefixes(unsigned int block_id, - flag_type& flag, - T& partial_prefix) - { - // Order of reduction must be reversed, because 0th thread has - // prefix from the (block_id_ - 1) block, 1st thread has prefix - // from (block_id_ - 2) block etc. - using headflag_scan_op_type = reverse_binary_op_wrapper< - BinaryFunction, T, T - >; - using warp_reduce_prefix_type = warp_reduce_crosslane< - T, ::rocprim::warp_size(), false - >; - - T block_prefix; - scan_state_.get(block_id, flag, block_prefix); - - auto headflag_scan_op = headflag_scan_op_type(scan_op_); - warp_reduce_prefix_type() - .tail_segmented_reduce( - block_prefix, - partial_prefix, - (flag == PREFIX_COMPLETE), - headflag_scan_op - ); - } - - ROCPRIM_DEVICE inline - T get_prefix() - { - flag_type flag; - unsigned int previous_block_id = block_id_ - ::rocprim::lane_id() - 1; - bool is_prefix_initialized = false; - T prefix; + ROCPRIM_DEVICE inline void + reduce_partial_prefixes(unsigned int block_id, flag_type& flag, T& partial_prefix) + { + // Order of reduction must be reversed, because 0th thread has + // prefix from the (block_id_ - 1) block, 1st thread has prefix + // from (block_id_ - 2) block etc. + using headflag_scan_op_type = reverse_binary_op_wrapper; + using warp_reduce_prefix_type = warp_reduce_crosslane; + + T block_prefix; + scan_state_.get(block_id, flag, block_prefix); + + auto headflag_scan_op = headflag_scan_op_type(scan_op_); + warp_reduce_prefix_type().tail_segmented_reduce( + block_prefix, partial_prefix, (flag == PREFIX_COMPLETE), headflag_scan_op); + } - do + ROCPRIM_DEVICE inline T get_prefix() { - // reduce last warp_size() number of prefixes to - // get the complete prefix for this block. - T partial_prefix; - reduce_partial_prefixes(previous_block_id, flag, partial_prefix); - if(!is_prefix_initialized) + flag_type flag; + unsigned int previous_block_id = block_id_ - ::rocprim::lane_id() - 1; + bool is_prefix_initialized = false; + T prefix; + + do { - prefix = partial_prefix; - is_prefix_initialized = true; - } - else + // reduce last warp_size() number of prefixes to + // get the complete prefix for this block. + T partial_prefix; + reduce_partial_prefixes(previous_block_id, flag, partial_prefix); + if(!is_prefix_initialized) + { + prefix = partial_prefix; + is_prefix_initialized = true; + } + else + { + prefix = scan_op_(partial_prefix, prefix); + } + previous_block_id -= ::rocprim::warp_size(); + // while we don't load a complete prefix, reduce partial prefixes + } while(::rocprim::detail::warp_all(flag != PREFIX_COMPLETE)); + return prefix; + } + + ROCPRIM_DEVICE inline T operator()(T reduction) + { + // Set partial prefix for next block + if(::rocprim::lane_id() == 0) { - prefix = scan_op_(partial_prefix, prefix); + scan_state_.set_partial(block_id_, reduction); } - previous_block_id -= ::rocprim::warp_size(); - // while we don't load a complete prefix, reduce partial prefixes - } while(::rocprim::detail::warp_all(flag != PREFIX_COMPLETE)); - return prefix; - } - - ROCPRIM_DEVICE inline - T operator()(T reduction) - { - // Set partial prefix for next block - if(::rocprim::lane_id() == 0) - { - scan_state_.set_partial(block_id_, reduction); - } - // Get prefix - auto prefix = get_prefix(); + // Get prefix + auto prefix = get_prefix(); - // Set complete prefix for next block - if(::rocprim::lane_id() == 0) - { - scan_state_.set_complete(block_id_, scan_op_(prefix, reduction)); + // Set complete prefix for next block + if(::rocprim::lane_id() == 0) + { + scan_state_.set_complete(block_id_, scan_op_(prefix, reduction)); + } + return prefix; } - return prefix; - } - -protected: - unsigned int block_id_; - BinaryFunction scan_op_; - LookbackScanState& scan_state_; -}; + + protected: + unsigned int block_id_; + BinaryFunction scan_op_; + LookbackScanState& scan_state_; + }; } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/ordered_block_id.hpp b/rocprim/include/rocprim/device/detail/ordered_block_id.hpp index 118c6b9e5..abe854cd2 100644 --- a/rocprim/include/rocprim/device/detail/ordered_block_id.hpp +++ b/rocprim/include/rocprim/device/detail/ordered_block_id.hpp @@ -21,8 +21,8 @@ #ifndef ROCPRIM_DEVICE_DETAIL_ORDERED_BLOCK_ID_HPP_ #define ROCPRIM_DEVICE_DETAIL_ORDERED_BLOCK_ID_HPP_ -#include #include +#include #include "../../detail/various.hpp" #include "../../intrinsics.hpp" @@ -33,52 +33,48 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Helper struct for generating ordered unique ids for blocks in a grid. -template -struct ordered_block_id -{ - static_assert(std::is_integral::value, "T must be integer"); - using id_type = T; - - // shared memory temporary storage type - struct storage_type + // Helper struct for generating ordered unique ids for blocks in a grid. + template + struct ordered_block_id { - id_type id; - }; + static_assert(std::is_integral::value, "T must be integer"); + using id_type = T; - ROCPRIM_HOST static inline - ordered_block_id create(id_type * id) - { - ordered_block_id ordered_id; - ordered_id.id = id; - return ordered_id; - } + // shared memory temporary storage type + struct storage_type + { + id_type id; + }; - ROCPRIM_HOST static inline - size_t get_storage_size() - { - return sizeof(id_type); - } + ROCPRIM_HOST static inline ordered_block_id create(id_type* id) + { + ordered_block_id ordered_id; + ordered_id.id = id; + return ordered_id; + } - ROCPRIM_DEVICE inline - void reset() - { - *id = static_cast(0); - } + ROCPRIM_HOST static inline size_t get_storage_size() + { + return sizeof(id_type); + } - ROCPRIM_DEVICE inline - id_type get(unsigned int tid, storage_type& storage) - { - if(tid == 0) + ROCPRIM_DEVICE inline void reset() { - storage.id = ::rocprim::detail::atomic_add(this->id, 1); + *id = static_cast(0); } - ::rocprim::syncthreads(); - return storage.id; - } - id_type* id; -}; + ROCPRIM_DEVICE inline id_type get(unsigned int tid, storage_type& storage) + { + if(tid == 0) + { + storage.id = ::rocprim::detail::atomic_add(this->id, 1); + } + ::rocprim::syncthreads(); + return storage.id; + } + + id_type* id; + }; } // end of detail namespace diff --git a/rocprim/include/rocprim/device/detail/uint_fast_div.hpp b/rocprim/include/rocprim/device/detail/uint_fast_div.hpp index 509019c97..54ef98417 100644 --- a/rocprim/include/rocprim/device/detail/uint_fast_div.hpp +++ b/rocprim/include/rocprim/device/detail/uint_fast_div.hpp @@ -28,81 +28,86 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Fast division by unsigned "constant" -// Used for fast division on device by precomputing magic numbers on host, -// hence no division by arbitrary values in kernel code. -// Hacker's Delight, Chapter 10, Integer Division By Constants (http://www.hackersdelight.org/) -// http://www.hackersdelight.org/hdcodetxt/magicu.c.txt -struct uint_fast_div -{ - unsigned int magic; // Magic number - unsigned int shift; // shift amount - unsigned int add; // "add" indicator - - ROCPRIM_HOST_DEVICE inline - uint_fast_div() = default; - - ROCPRIM_HOST_DEVICE inline - uint_fast_div(unsigned int d) + // Fast division by unsigned "constant" + // Used for fast division on device by precomputing magic numbers on host, + // hence no division by arbitrary values in kernel code. + // Hacker's Delight, Chapter 10, Integer Division By Constants (http://www.hackersdelight.org/) + // http://www.hackersdelight.org/hdcodetxt/magicu.c.txt + struct uint_fast_div { - // Must have 1 <= d <= 2**32-1. + unsigned int magic; // Magic number + unsigned int shift; // shift amount + unsigned int add; // "add" indicator + + ROCPRIM_HOST_DEVICE inline uint_fast_div() = default; - if(d == 1) + ROCPRIM_HOST_DEVICE inline uint_fast_div(unsigned int d) { - magic = 0; - shift = 0; - add = 0; - return; - } + // Must have 1 <= d <= 2**32-1. - int p; - unsigned int p32 = 1, q, r, delta; - add = 0; // Initialize "add" indicator. - p = 31; // Initialize p. - q = 0x7FFFFFFF/d; // Initialize q = (2**p - 1)/d. - r = 0x7FFFFFFF - q*d; // Init. r = rem(2**p - 1, d). - do { - p = p + 1; - if(p == 32) p32 = 1; // Set p32 = 2**(p-32). - else p32 = 2*p32; - if(r + 1 >= d - r) + if(d == 1) { - if(q >= 0x7FFFFFFF) add = 1; - q = 2*q + 1; - r = 2*r + 1 - d; + magic = 0; + shift = 0; + add = 0; + return; } - else + + int p; + unsigned int p32 = 1, q, r, delta; + add = 0; // Initialize "add" indicator. + p = 31; // Initialize p. + q = 0x7FFFFFFF / d; // Initialize q = (2**p - 1)/d. + r = 0x7FFFFFFF - q * d; // Init. r = rem(2**p - 1, d). + do { - if(q >= 0x80000000) add = 1; - q = 2*q; - r = 2*r + 1; - } - delta = d - 1 - r; - } while (p < 64 && p32 < delta); - magic = q + 1; // Magic number and - shift = p - 32; // shift amount + p = p + 1; + if(p == 32) + p32 = 1; // Set p32 = 2**(p-32). + else + p32 = 2 * p32; + if(r + 1 >= d - r) + { + if(q >= 0x7FFFFFFF) + add = 1; + q = 2 * q + 1; + r = 2 * r + 1 - d; + } + else + { + if(q >= 0x80000000) + add = 1; + q = 2 * q; + r = 2 * r + 1; + } + delta = d - 1 - r; + } while(p < 64 && p32 < delta); + magic = q + 1; // Magic number and + shift = p - 32; // shift amount - if(add) shift--; - } -}; + if(add) + shift--; + } + }; -ROCPRIM_HOST_DEVICE inline -unsigned int operator/(unsigned int n, const uint_fast_div& divisor) -{ - if(divisor.magic == 0) + ROCPRIM_HOST_DEVICE inline unsigned int operator/(unsigned int n, const uint_fast_div& divisor) { - // Special case for 1 - return n; - } + if(divisor.magic == 0) + { + // Special case for 1 + return n; + } - // Higher 32-bit of 64-bit multiplication - unsigned int q = (static_cast(divisor.magic) * static_cast(n)) >> 32; - if(divisor.add) - { - q = ((n - q) >> 1) + q; + // Higher 32-bit of 64-bit multiplication + unsigned int q + = (static_cast(divisor.magic) * static_cast(n)) + >> 32; + if(divisor.add) + { + q = ((n - q) >> 1) + q; + } + return q >> divisor.shift; } - return q >> divisor.shift; -} } // end of detail namespace diff --git a/rocprim/include/rocprim/device/device_binary_search.hpp b/rocprim/include/rocprim/device/device_binary_search.hpp index 5f655b227..e8a817d10 100644 --- a/rocprim/include/rocprim/device/device_binary_search.hpp +++ b/rocprim/include/rocprim/device/device_binary_search.hpp @@ -21,8 +21,8 @@ #ifndef ROCPRIM_DEVICE_DEVICE_BINARY_SEARCH_HPP_ #define ROCPRIM_DEVICE_DEVICE_BINARY_SEARCH_HPP_ -#include #include +#include #include "../config.hpp" #include "../detail/various.hpp" @@ -39,134 +39,133 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class Config, - class HaystackIterator, - class NeedlesIterator, - class OutputIterator, - class SearchFunction, - class CompareFunction -> -inline -hipError_t binary_search(void * temporary_storage, - size_t& storage_size, - HaystackIterator haystack, - NeedlesIterator needles, - OutputIterator output, - size_t haystack_size, - size_t needles_size, - SearchFunction search_op, - CompareFunction compare_op, - hipStream_t stream, - bool debug_synchronous) -{ - using value_type = typename std::iterator_traits::value_type; - - if(temporary_storage == nullptr) + template + inline hipError_t binary_search(void* temporary_storage, + size_t& storage_size, + HaystackIterator haystack, + NeedlesIterator needles, + OutputIterator output, + size_t haystack_size, + size_t needles_size, + SearchFunction search_op, + CompareFunction compare_op, + hipStream_t stream, + bool debug_synchronous) { - // Make sure user won't try to allocate 0 bytes memory, otherwise - // user may again pass nullptr as temporary_storage - storage_size = 4; - return hipSuccess; - } + using value_type = typename std::iterator_traits::value_type; - return transform( - needles, output, - needles_size, - [haystack, haystack_size, search_op, compare_op] - ROCPRIM_DEVICE - (const value_type& value) + if(temporary_storage == nullptr) { - return search_op(haystack, haystack_size, value, compare_op); - }, - stream, debug_synchronous - ); -} + // Make sure user won't try to allocate 0 bytes memory, otherwise + // user may again pass nullptr as temporary_storage + storage_size = 4; + return hipSuccess; + } + + return transform( + needles, + output, + needles_size, + [haystack, haystack_size, search_op, compare_op] ROCPRIM_DEVICE( + const value_type& value) { + return search_op(haystack, haystack_size, value, compare_op); + }, + stream, + debug_synchronous); + } } // end of detail namespace -template< - class Config = default_config, - class HaystackIterator, - class NeedlesIterator, - class OutputIterator, - class CompareFunction = ::rocprim::less<> -> -inline -hipError_t lower_bound(void * temporary_storage, - size_t& storage_size, - HaystackIterator haystack, - NeedlesIterator needles, - OutputIterator output, - size_t haystack_size, - size_t needles_size, - CompareFunction compare_op = CompareFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template > +inline hipError_t lower_bound(void* temporary_storage, + size_t& storage_size, + HaystackIterator haystack, + NeedlesIterator needles, + OutputIterator output, + size_t haystack_size, + size_t needles_size, + CompareFunction compare_op = CompareFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::binary_search( - temporary_storage, storage_size, - haystack, needles, output, - haystack_size, needles_size, - detail::lower_bound_search_op(), compare_op, - stream, debug_synchronous - ); + return detail::binary_search(temporary_storage, + storage_size, + haystack, + needles, + output, + haystack_size, + needles_size, + detail::lower_bound_search_op(), + compare_op, + stream, + debug_synchronous); } -template< - class Config = default_config, - class HaystackIterator, - class NeedlesIterator, - class OutputIterator, - class CompareFunction = ::rocprim::less<> -> -inline -hipError_t upper_bound(void * temporary_storage, - size_t& storage_size, - HaystackIterator haystack, - NeedlesIterator needles, - OutputIterator output, - size_t haystack_size, - size_t needles_size, - CompareFunction compare_op = CompareFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template > +inline hipError_t upper_bound(void* temporary_storage, + size_t& storage_size, + HaystackIterator haystack, + NeedlesIterator needles, + OutputIterator output, + size_t haystack_size, + size_t needles_size, + CompareFunction compare_op = CompareFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::binary_search( - temporary_storage, storage_size, - haystack, needles, output, - haystack_size, needles_size, - detail::upper_bound_search_op(), compare_op, - stream, debug_synchronous - ); + return detail::binary_search(temporary_storage, + storage_size, + haystack, + needles, + output, + haystack_size, + needles_size, + detail::upper_bound_search_op(), + compare_op, + stream, + debug_synchronous); } -template< - class Config = default_config, - class HaystackIterator, - class NeedlesIterator, - class OutputIterator, - class CompareFunction = ::rocprim::less<> -> -inline -hipError_t binary_search(void * temporary_storage, - size_t& storage_size, - HaystackIterator haystack, - NeedlesIterator needles, - OutputIterator output, - size_t haystack_size, - size_t needles_size, - CompareFunction compare_op = CompareFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template > +inline hipError_t binary_search(void* temporary_storage, + size_t& storage_size, + HaystackIterator haystack, + NeedlesIterator needles, + OutputIterator output, + size_t haystack_size, + size_t needles_size, + CompareFunction compare_op = CompareFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::binary_search( - temporary_storage, storage_size, - haystack, needles, output, - haystack_size, needles_size, - detail::binary_search_op(), compare_op, - stream, debug_synchronous - ); + return detail::binary_search(temporary_storage, + storage_size, + haystack, + needles, + output, + haystack_size, + needles_size, + detail::binary_search_op(), + compare_op, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_histogram.hpp b/rocprim/include/rocprim/device/device_histogram.hpp index 4e561e78e..20c266f86 100644 --- a/rocprim/include/rocprim/device/device_histogram.hpp +++ b/rocprim/include/rocprim/device/device_histogram.hpp @@ -22,15 +22,15 @@ #define ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_ #include -#include #include +#include #include "../config.hpp" -#include "../functional.hpp" #include "../detail/various.hpp" +#include "../functional.hpp" -#include "device_histogram_config.hpp" #include "detail/device_histogram.hpp" +#include "device_histogram_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -40,307 +40,312 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - unsigned int BlockSize, - unsigned int ActiveChannels, - class Counter -> -__global__ -void init_histogram_kernel(fixed_array histogram, - fixed_array bins) -{ - init_histogram(histogram, bins); -} + template + __global__ void init_histogram_kernel(fixed_array histogram, + fixed_array bins) + { + init_histogram(histogram, bins); + } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - unsigned int ActiveChannels, - class SampleIterator, - class Counter, - class SampleToBinOp -> -__global__ -void histogram_shared_kernel(SampleIterator samples, - unsigned int columns, - unsigned int rows, - unsigned int row_stride, - unsigned int rows_per_block, - fixed_array histogram, - fixed_array sample_to_bin_op, - fixed_array bins) -{ - HIP_DYNAMIC_SHARED(unsigned int, block_histogram); + template + __global__ void + histogram_shared_kernel(SampleIterator samples, + unsigned int columns, + unsigned int rows, + unsigned int row_stride, + unsigned int rows_per_block, + fixed_array histogram, + fixed_array sample_to_bin_op, + fixed_array bins) + { + HIP_DYNAMIC_SHARED(unsigned int, block_histogram); - histogram_shared( - samples, columns, rows, row_stride, rows_per_block, - histogram, - sample_to_bin_op, bins, - block_histogram - ); -} + histogram_shared(samples, + columns, + rows, + row_stride, + rows_per_block, + histogram, + sample_to_bin_op, + bins, + block_histogram); + } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int Channels, - unsigned int ActiveChannels, - class SampleIterator, - class Counter, - class SampleToBinOp -> -__global__ -void histogram_global_kernel(SampleIterator samples, - unsigned int columns, - unsigned int row_stride, - fixed_array histogram, - fixed_array sample_to_bin_op, - fixed_array bins_bits) -{ - histogram_global( - samples, columns, row_stride, - histogram, - sample_to_bin_op, bins_bits - ); -} + template + __global__ void + histogram_global_kernel(SampleIterator samples, + unsigned int columns, + unsigned int row_stride, + fixed_array histogram, + fixed_array sample_to_bin_op, + fixed_array bins_bits) + { + histogram_global( + samples, columns, row_stride, histogram, sample_to_bin_op, bins_bits); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - unsigned int Channels, - unsigned int ActiveChannels, - class Config, - class SampleIterator, - class Counter, - class SampleToBinOp -> -inline -hipError_t histogram_impl(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int columns, - unsigned int rows, - size_t row_stride_bytes, - Counter * histogram[ActiveChannels], - unsigned int levels[ActiveChannels], - SampleToBinOp sample_to_bin_op[ActiveChannels], - hipStream_t stream, - bool debug_synchronous) -{ - using sample_type = typename std::iterator_traits::value_type; + template + inline hipError_t histogram_impl(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int columns, + unsigned int rows, + size_t row_stride_bytes, + Counter* histogram[ActiveChannels], + unsigned int levels[ActiveChannels], + SampleToBinOp sample_to_bin_op[ActiveChannels], + hipStream_t stream, + bool debug_synchronous) + { + using sample_type = typename std::iterator_traits::value_type; - using config = default_or_custom_config< - Config, - default_histogram_config - >; + using config = default_or_custom_config< + Config, + default_histogram_config>; - constexpr unsigned int block_size = config::histogram::block_size; - constexpr unsigned int items_per_thread = config::histogram::items_per_thread; - constexpr unsigned int items_per_block = block_size * items_per_thread; + constexpr unsigned int block_size = config::histogram::block_size; + constexpr unsigned int items_per_thread = config::histogram::items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; - if(row_stride_bytes % sizeof(sample_type) != 0) - { - // Row stride must be a whole multiple of the sample data type size - return hipErrorInvalidValue; - } + if(row_stride_bytes % sizeof(sample_type) != 0) + { + // Row stride must be a whole multiple of the sample data type size + return hipErrorInvalidValue; + } - const unsigned int blocks_x = ::rocprim::detail::ceiling_div(columns, items_per_block); - const unsigned int row_stride = row_stride_bytes / sizeof(sample_type); + const unsigned int blocks_x = ::rocprim::detail::ceiling_div(columns, items_per_block); + const unsigned int row_stride = row_stride_bytes / sizeof(sample_type); - if(temporary_storage == nullptr) - { - // Make sure user won't try to allocate 0 bytes memory, because - // hipMalloc will return nullptr. - storage_size = 4; - return hipSuccess; - } + if(temporary_storage == nullptr) + { + // Make sure user won't try to allocate 0 bytes memory, because + // hipMalloc will return nullptr. + storage_size = 4; + return hipSuccess; + } - if(debug_synchronous) - { - std::cout << "columns " << columns << '\n'; - std::cout << "rows " << rows << '\n'; - std::cout << "blocks_x " << blocks_x << '\n'; - hipError_t error = hipStreamSynchronize(stream); - if(error != hipSuccess) return error; - } + if(debug_synchronous) + { + std::cout << "columns " << columns << '\n'; + std::cout << "rows " << rows << '\n'; + std::cout << "blocks_x " << blocks_x << '\n'; + hipError_t error = hipStreamSynchronize(stream); + if(error != hipSuccess) + return error; + } - unsigned int bins[ActiveChannels]; - unsigned int bins_bits[ActiveChannels]; - unsigned int total_bins = 0; - unsigned int max_bins = 0; - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - bins[channel] = levels[channel] - 1; - bins_bits[channel] = static_cast(std::log2(detail::next_power_of_two(bins[channel]))); - total_bins += bins[channel]; - max_bins = std::max(max_bins, bins[channel]); - } + unsigned int bins[ActiveChannels]; + unsigned int bins_bits[ActiveChannels]; + unsigned int total_bins = 0; + unsigned int max_bins = 0; + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + bins[channel] = levels[channel] - 1; + bins_bits[channel] + = static_cast(std::log2(detail::next_power_of_two(bins[channel]))); + total_bins += bins[channel]; + max_bins = std::max(max_bins, bins[channel]); + } - std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(init_histogram_kernel), - dim3(::rocprim::detail::ceiling_div(max_bins, block_size)), dim3(block_size), 0, stream, - fixed_array(histogram), - fixed_array(bins) - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_histogram", max_bins, start); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_histogram_kernel), + dim3(::rocprim::detail::ceiling_div(max_bins, block_size)), + dim3(block_size), + 0, + stream, + fixed_array(histogram), + fixed_array(bins)); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_histogram", max_bins, start); + + if(columns == 0 || rows == 0) + { + return hipSuccess; + } + + if(total_bins <= config::shared_impl_max_bins) + { + dim3 grid_size; + grid_size.x = std::min(config::max_grid_size, blocks_x); + grid_size.y = std::min(rows, config::max_grid_size / grid_size.x); + const size_t block_histogram_bytes = total_bins * sizeof(unsigned int); + const unsigned int rows_per_block = ::rocprim::detail::ceiling_div(rows, grid_size.y); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(histogram_shared_kernel), + grid_size, + dim3(block_size, 1), + block_histogram_bytes, + stream, + samples, + columns, + rows, + row_stride, + rows_per_block, + fixed_array(histogram), + fixed_array(sample_to_bin_op), + fixed_array(bins)); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR( + "histogram_shared", grid_size.x * grid_size.y * block_size, start); + } + else + { + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(histogram_global_kernel), + dim3(blocks_x, rows), + dim3(block_size, 1), + 0, + stream, + samples, + columns, + row_stride, + fixed_array(histogram), + fixed_array(sample_to_bin_op), + fixed_array(bins_bits)); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR( + "histogram_global", blocks_x * block_size * rows, start); + } - if(columns == 0 || rows == 0) - { return hipSuccess; } - if(total_bins <= config::shared_impl_max_bins) + template + inline hipError_t histogram_even_impl(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int columns, + unsigned int rows, + size_t row_stride_bytes, + Counter* histogram[ActiveChannels], + unsigned int levels[ActiveChannels], + Level lower_level[ActiveChannels], + Level upper_level[ActiveChannels], + hipStream_t stream, + bool debug_synchronous) { - dim3 grid_size; - grid_size.x = std::min(config::max_grid_size, blocks_x); - grid_size.y = std::min(rows, config::max_grid_size / grid_size.x); - const size_t block_histogram_bytes = total_bins * sizeof(unsigned int); - const unsigned int rows_per_block = ::rocprim::detail::ceiling_div(rows, grid_size.y); - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(histogram_shared_kernel< - block_size, items_per_thread, Channels, ActiveChannels - >), - grid_size, dim3(block_size, 1), block_histogram_bytes, stream, - samples, columns, rows, row_stride, rows_per_block, - fixed_array(histogram), - fixed_array(sample_to_bin_op), - fixed_array(bins) - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("histogram_shared", grid_size.x * grid_size.y * block_size, start); - } - else - { - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(histogram_global_kernel< - block_size, items_per_thread, Channels, ActiveChannels - >), - dim3(blocks_x, rows), dim3(block_size, 1), 0, stream, - samples, columns, row_stride, - fixed_array(histogram), - fixed_array(sample_to_bin_op), - fixed_array(bins_bits) - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("histogram_global", blocks_x * block_size * rows, start); - } - - return hipSuccess; -} + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + if(levels[channel] < 2) + { + // Histogram must have at least 1 bin + return hipErrorInvalidValue; + } + } -template< - unsigned int Channels, - unsigned int ActiveChannels, - class Config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t histogram_even_impl(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int columns, - unsigned int rows, - size_t row_stride_bytes, - Counter * histogram[ActiveChannels], - unsigned int levels[ActiveChannels], - Level lower_level[ActiveChannels], - Level upper_level[ActiveChannels], - hipStream_t stream, - bool debug_synchronous) -{ - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - if(levels[channel] < 2) + sample_to_bin_even sample_to_bin_op[ActiveChannels]; + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - // Histogram must have at least 1 bin - return hipErrorInvalidValue; + sample_to_bin_op[channel] = sample_to_bin_even( + levels[channel] - 1, lower_level[channel], upper_level[channel]); } - } - sample_to_bin_even sample_to_bin_op[ActiveChannels]; - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - sample_to_bin_op[channel] = sample_to_bin_even( - levels[channel] - 1, - lower_level[channel], upper_level[channel] - ); + return histogram_impl(temporary_storage, + storage_size, + samples, + columns, + rows, + row_stride_bytes, + histogram, + levels, + sample_to_bin_op, + stream, + debug_synchronous); } - return histogram_impl( - temporary_storage, storage_size, - samples, columns, rows, row_stride_bytes, - histogram, - levels, sample_to_bin_op, - stream, debug_synchronous - ); -} - -template< - unsigned int Channels, - unsigned int ActiveChannels, - class Config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t histogram_range_impl(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int columns, - unsigned int rows, - size_t row_stride_bytes, - Counter * histogram[ActiveChannels], - unsigned int levels[ActiveChannels], - Level * level_values[ActiveChannels], - hipStream_t stream, - bool debug_synchronous) -{ - for(unsigned int channel = 0; channel < ActiveChannels; channel++) + template + inline hipError_t histogram_range_impl(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int columns, + unsigned int rows, + size_t row_stride_bytes, + Counter* histogram[ActiveChannels], + unsigned int levels[ActiveChannels], + Level* level_values[ActiveChannels], + hipStream_t stream, + bool debug_synchronous) { - if(levels[channel] < 2) + for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - // Histogram must have at least 1 bin - return hipErrorInvalidValue; + if(levels[channel] < 2) + { + // Histogram must have at least 1 bin + return hipErrorInvalidValue; + } } - } - sample_to_bin_range sample_to_bin_op[ActiveChannels]; - for(unsigned int channel = 0; channel < ActiveChannels; channel++) - { - sample_to_bin_op[channel] = sample_to_bin_range( - levels[channel] - 1, - level_values[channel] - ); - } + sample_to_bin_range sample_to_bin_op[ActiveChannels]; + for(unsigned int channel = 0; channel < ActiveChannels; channel++) + { + sample_to_bin_op[channel] + = sample_to_bin_range(levels[channel] - 1, level_values[channel]); + } - return histogram_impl( - temporary_storage, storage_size, - samples, columns, rows, row_stride_bytes, - histogram, - levels, sample_to_bin_op, - stream, debug_synchronous - ); -} + return histogram_impl(temporary_storage, + storage_size, + samples, + columns, + rows, + row_stride_bytes, + histogram, + levels, + sample_to_bin_op, + stream, + debug_synchronous); + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR @@ -415,36 +420,35 @@ hipError_t histogram_range_impl(void * temporary_storage, /// // histogram: [3, 0, 1, 0, 2] /// \endcode /// \endparblock -template< - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t histogram_even(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int size, - Counter * histogram, - unsigned int levels, - Level lower_level, - Level upper_level, - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t histogram_even(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int size, + Counter* histogram, + unsigned int levels, + Level lower_level, + Level upper_level, + hipStream_t stream = 0, + bool debug_synchronous = false) { - Counter * histogram_single[1] = { histogram }; - unsigned int levels_single[1] = { levels }; - Level lower_level_single[1] = { lower_level }; - Level upper_level_single[1] = { upper_level }; + Counter* histogram_single[1] = {histogram}; + unsigned int levels_single[1] = {levels}; + Level lower_level_single[1] = {lower_level}; + Level upper_level_single[1] = {upper_level}; - return detail::histogram_even_impl<1, 1, Config>( - temporary_storage, storage_size, - samples, size, 1, 0, - histogram_single, - levels_single, lower_level_single, upper_level_single, - stream, debug_synchronous - ); + return detail::histogram_even_impl<1, 1, Config>(temporary_storage, + storage_size, + samples, + size, + 1, + 0, + histogram_single, + levels_single, + lower_level_single, + upper_level_single, + stream, + debug_synchronous); } /// \brief Computes a histogram from a two-dimensional region of samples using equal-width bins. @@ -524,38 +528,37 @@ hipError_t histogram_even(void * temporary_storage, /// // histogram: [3, 0, 1, 0, 2] /// \endcode /// \endparblock -template< - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t histogram_even(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int columns, - unsigned int rows, - size_t row_stride_bytes, - Counter * histogram, - unsigned int levels, - Level lower_level, - Level upper_level, - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t histogram_even(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int columns, + unsigned int rows, + size_t row_stride_bytes, + Counter* histogram, + unsigned int levels, + Level lower_level, + Level upper_level, + hipStream_t stream = 0, + bool debug_synchronous = false) { - Counter * histogram_single[1] = { histogram }; - unsigned int levels_single[1] = { levels }; - Level lower_level_single[1] = { lower_level }; - Level upper_level_single[1] = { upper_level }; + Counter* histogram_single[1] = {histogram}; + unsigned int levels_single[1] = {levels}; + Level lower_level_single[1] = {lower_level}; + Level upper_level_single[1] = {upper_level}; - return detail::histogram_even_impl<1, 1, Config>( - temporary_storage, storage_size, - samples, columns, rows, row_stride_bytes, - histogram_single, - levels_single, lower_level_single, upper_level_single, - stream, debug_synchronous - ); + return detail::histogram_even_impl<1, 1, Config>(temporary_storage, + storage_size, + samples, + columns, + rows, + row_stride_bytes, + histogram_single, + levels_single, + lower_level_single, + upper_level_single, + stream, + debug_synchronous); } /// \brief Computes histograms from a sequence of multi-channel samples using equal-width bins. @@ -636,33 +639,35 @@ hipError_t histogram_even(void * temporary_storage, /// // [2, 2, 0, 0, 0, 2, 2, ..., 0]] /// \endcode /// \endparblock -template< - unsigned int Channels, - unsigned int ActiveChannels, - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t multi_histogram_even(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int size, - Counter * histogram[ActiveChannels], - unsigned int levels[ActiveChannels], - Level lower_level[ActiveChannels], - Level upper_level[ActiveChannels], - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t multi_histogram_even(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int size, + Counter* histogram[ActiveChannels], + unsigned int levels[ActiveChannels], + Level lower_level[ActiveChannels], + Level upper_level[ActiveChannels], + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::histogram_even_impl( - temporary_storage, storage_size, - samples, size, 1, 0, - histogram, - levels, lower_level, upper_level, - stream, debug_synchronous - ); + return detail::histogram_even_impl(temporary_storage, + storage_size, + samples, + size, + 1, + 0, + histogram, + levels, + lower_level, + upper_level, + stream, + debug_synchronous); } /// \brief Computes histograms from a two-dimensional region of multi-channel samples using equal-width bins. @@ -751,35 +756,37 @@ hipError_t multi_histogram_even(void * temporary_storage, /// // [2, 2, 0, 0, 0, 2, 2, ..., 0]] /// \endcode /// \endparblock -template< - unsigned int Channels, - unsigned int ActiveChannels, - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t multi_histogram_even(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int columns, - unsigned int rows, - size_t row_stride_bytes, - Counter * histogram[ActiveChannels], - unsigned int levels[ActiveChannels], - Level lower_level[ActiveChannels], - Level upper_level[ActiveChannels], - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t multi_histogram_even(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int columns, + unsigned int rows, + size_t row_stride_bytes, + Counter* histogram[ActiveChannels], + unsigned int levels[ActiveChannels], + Level lower_level[ActiveChannels], + Level upper_level[ActiveChannels], + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::histogram_even_impl( - temporary_storage, storage_size, - samples, columns, rows, row_stride_bytes, - histogram, - levels, lower_level, upper_level, - stream, debug_synchronous - ); + return detail::histogram_even_impl(temporary_storage, + storage_size, + samples, + columns, + rows, + row_stride_bytes, + histogram, + levels, + lower_level, + upper_level, + stream, + debug_synchronous); } /// \brief Computes a histogram from a sequence of samples using the specified bin boundary levels. @@ -848,34 +855,32 @@ hipError_t multi_histogram_even(void * temporary_storage, /// // histogram: [1, 2, 3, 0, 0] /// \endcode /// \endparblock -template< - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t histogram_range(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int size, - Counter * histogram, - unsigned int levels, - Level * level_values, - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t histogram_range(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int size, + Counter* histogram, + unsigned int levels, + Level* level_values, + hipStream_t stream = 0, + bool debug_synchronous = false) { - Counter * histogram_single[1] = { histogram }; - unsigned int levels_single[1] = { levels }; - Level * level_values_single[1] = { level_values }; + Counter* histogram_single[1] = {histogram}; + unsigned int levels_single[1] = {levels}; + Level* level_values_single[1] = {level_values}; - return detail::histogram_range_impl<1, 1, Config>( - temporary_storage, storage_size, - samples, size, 1, 0, - histogram_single, - levels_single, level_values_single, - stream, debug_synchronous - ); + return detail::histogram_range_impl<1, 1, Config>(temporary_storage, + storage_size, + samples, + size, + 1, + 0, + histogram_single, + levels_single, + level_values_single, + stream, + debug_synchronous); } /// \brief Computes a histogram from a two-dimensional region of samples using the specified bin boundary levels. @@ -952,36 +957,34 @@ hipError_t histogram_range(void * temporary_storage, /// // histogram: [1, 2, 3, 0, 0] /// \endcode /// \endparblock -template< - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t histogram_range(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int columns, - unsigned int rows, - size_t row_stride_bytes, - Counter * histogram, - unsigned int levels, - Level * level_values, - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t histogram_range(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int columns, + unsigned int rows, + size_t row_stride_bytes, + Counter* histogram, + unsigned int levels, + Level* level_values, + hipStream_t stream = 0, + bool debug_synchronous = false) { - Counter * histogram_single[1] = { histogram }; - unsigned int levels_single[1] = { levels }; - Level * level_values_single[1] = { level_values }; + Counter* histogram_single[1] = {histogram}; + unsigned int levels_single[1] = {levels}; + Level* level_values_single[1] = {level_values}; - return detail::histogram_range_impl<1, 1, Config>( - temporary_storage, storage_size, - samples, columns, rows, row_stride_bytes, - histogram_single, - levels_single, level_values_single, - stream, debug_synchronous - ); + return detail::histogram_range_impl<1, 1, Config>(temporary_storage, + storage_size, + samples, + columns, + rows, + row_stride_bytes, + histogram_single, + levels_single, + level_values_single, + stream, + debug_synchronous); } /// \brief Computes histograms from a sequence of multi-channel samples using the specified bin boundary levels. @@ -1058,32 +1061,33 @@ hipError_t histogram_range(void * temporary_storage, /// // histogram: [[2, 4, 2], [7, 0, 1], [2, 6]] /// \endcode /// \endparblock -template< - unsigned int Channels, - unsigned int ActiveChannels, - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t multi_histogram_range(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int size, - Counter * histogram[ActiveChannels], - unsigned int levels[ActiveChannels], - Level * level_values[ActiveChannels], - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t multi_histogram_range(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int size, + Counter* histogram[ActiveChannels], + unsigned int levels[ActiveChannels], + Level* level_values[ActiveChannels], + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::histogram_range_impl( - temporary_storage, storage_size, - samples, size, 1, 0, - histogram, - levels, level_values, - stream, debug_synchronous - ); + return detail::histogram_range_impl(temporary_storage, + storage_size, + samples, + size, + 1, + 0, + histogram, + levels, + level_values, + stream, + debug_synchronous); } /// \brief Computes histograms from a two-dimensional region of multi-channel samples using the specified bin @@ -1169,34 +1173,35 @@ hipError_t multi_histogram_range(void * temporary_storage, /// // histogram: [[2, 4, 2], [7, 0, 1], [2, 6]] /// \endcode /// \endparblock -template< - unsigned int Channels, - unsigned int ActiveChannels, - class Config = default_config, - class SampleIterator, - class Counter, - class Level -> -inline -hipError_t multi_histogram_range(void * temporary_storage, - size_t& storage_size, - SampleIterator samples, - unsigned int columns, - unsigned int rows, - size_t row_stride_bytes, - Counter * histogram[ActiveChannels], - unsigned int levels[ActiveChannels], - Level * level_values[ActiveChannels], - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t multi_histogram_range(void* temporary_storage, + size_t& storage_size, + SampleIterator samples, + unsigned int columns, + unsigned int rows, + size_t row_stride_bytes, + Counter* histogram[ActiveChannels], + unsigned int levels[ActiveChannels], + Level* level_values[ActiveChannels], + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::histogram_range_impl( - temporary_storage, storage_size, - samples, columns, rows, row_stride_bytes, - histogram, - levels, level_values, - stream, debug_synchronous - ); + return detail::histogram_range_impl(temporary_storage, + storage_size, + samples, + columns, + rows, + row_stride_bytes, + histogram, + levels, + level_values, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_histogram_config.hpp b/rocprim/include/rocprim/device/device_histogram_config.hpp index 17180d810..39b51a345 100644 --- a/rocprim/include/rocprim/device/device_histogram_config.hpp +++ b/rocprim/include/rocprim/device/device_histogram_config.hpp @@ -40,63 +40,62 @@ BEGIN_ROCPRIM_NAMESPACE /// \tparam SharedImplMaxBins - maximum total number of bins for all active channels /// for the shared memory histogram implementation (samples -> shared memory bins -> global memory bins), /// when exceeded the global memory implementation is used (samples -> global memory bins). -template< - class HistogramConfig, - unsigned int MaxGridSize = 1024, - unsigned int SharedImplMaxBins = 2048 -> +template struct histogram_config { #ifndef DOXYGEN_SHOULD_SKIP_THIS using histogram = HistogramConfig; - static constexpr unsigned int max_grid_size = MaxGridSize; + static constexpr unsigned int max_grid_size = MaxGridSize; static constexpr unsigned int shared_impl_max_bins = SharedImplMaxBins; #endif }; #ifndef DOXYGEN_SHOULD_SKIP_THIS -template< - class HistogramConfig, - unsigned int MaxGridSize, - unsigned int SharedImplMaxBins -> constexpr unsigned int -histogram_config::max_grid_size; -template< - class HistogramConfig, - unsigned int MaxGridSize, - unsigned int SharedImplMaxBins -> constexpr unsigned int -histogram_config::shared_impl_max_bins; +template +constexpr unsigned int + histogram_config::max_grid_size; +template +constexpr unsigned int + histogram_config::shared_impl_max_bins; #endif namespace detail { -template -struct histogram_config_803 -{ - static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int)); - - using type = histogram_config>; -}; - -template -struct histogram_config_900 -{ - static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int)); - - using type = histogram_config>; -}; - -template -struct default_histogram_config - : select_arch< - TargetArch, - select_arch_case<803, histogram_config_803 >, - select_arch_case<900, histogram_config_900 >, - histogram_config_900 - > { }; + template + struct histogram_config_803 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int)); + + using type + = histogram_config>; + }; + + template + struct histogram_config_900 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int)); + + using type + = histogram_config>; + }; + + template + struct default_histogram_config + : select_arch>, + select_arch_case<900, histogram_config_900>, + histogram_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_merge.hpp b/rocprim/include/rocprim/device/device_merge.hpp index b56cb62b9..8dddb5c54 100644 --- a/rocprim/include/rocprim/device/device_merge.hpp +++ b/rocprim/include/rocprim/device/device_merge.hpp @@ -21,14 +21,14 @@ #ifndef ROCPRIM_DEVICE_DEVICE_MERGE_HPP_ #define ROCPRIM_DEVICE_DEVICE_MERGE_HPP_ -#include #include +#include #include "../config.hpp" #include "../detail/various.hpp" -#include "device_merge_config.hpp" #include "detail/device_merge.hpp" +#include "device_merge_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -38,159 +38,171 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class IndexIterator, - class KeysInputIterator1, - class KeysInputIterator2, - class BinaryFunction -> -__global__ -void partition_kernel(IndexIterator index, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - const size_t input1_size, - const size_t input2_size, - const unsigned int spacing, - BinaryFunction compare_function) -{ - partition_kernel_impl( - index, keys_input1, keys_input2, input1_size, input2_size, - spacing, compare_function - ); -} + template + __global__ void partition_kernel(IndexIterator index, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + const size_t input1_size, + const size_t input2_size, + const unsigned int spacing, + BinaryFunction compare_function) + { + partition_kernel_impl( + index, keys_input1, keys_input2, input1_size, input2_size, spacing, compare_function); + } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class IndexIterator, - class KeysInputIterator1, - class KeysInputIterator2, - class KeysOutputIterator, - class ValuesInputIterator1, - class ValuesInputIterator2, - class ValuesOutputIterator, - class BinaryFunction -> -__global__ -void merge_kernel(IndexIterator index, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - KeysOutputIterator keys_output, - ValuesInputIterator1 values_input1, - ValuesInputIterator2 values_input2, - ValuesOutputIterator values_output, - const size_t input1_size, - const size_t input2_size, - BinaryFunction compare_function) -{ - merge_kernel_impl( - index, keys_input1, keys_input2, keys_output, - values_input1, values_input2, values_output, - input1_size, input2_size, compare_function - ); -} + template + __global__ void merge_kernel(IndexIterator index, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + KeysOutputIterator keys_output, + ValuesInputIterator1 values_input1, + ValuesInputIterator2 values_input2, + ValuesOutputIterator values_output, + const size_t input1_size, + const size_t input2_size, + BinaryFunction compare_function) + { + merge_kernel_impl(index, + keys_input1, + keys_input2, + keys_output, + values_input1, + values_input2, + values_output, + input1_size, + input2_size, + compare_function); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - class Config, - class KeysInputIterator1, - class KeysInputIterator2, - class KeysOutputIterator, - class ValuesInputIterator1, - class ValuesInputIterator2, - class ValuesOutputIterator, - class BinaryFunction -> -inline -hipError_t merge_impl(void * temporary_storage, - size_t& storage_size, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - KeysOutputIterator keys_output, - ValuesInputIterator1 values_input1, - ValuesInputIterator2 values_input2, - ValuesOutputIterator values_output, - const size_t input1_size, - const size_t input2_size, - BinaryFunction compare_function, - const hipStream_t stream, - bool debug_synchronous) + template + inline hipError_t merge_impl(void* temporary_storage, + size_t& storage_size, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + KeysOutputIterator keys_output, + ValuesInputIterator1 values_input1, + ValuesInputIterator2 values_input2, + ValuesOutputIterator values_output, + const size_t input1_size, + const size_t input2_size, + BinaryFunction compare_function, + const hipStream_t stream, + bool debug_synchronous) -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; + { + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; - // Get default config if Config is default_config - using config = detail::default_or_custom_config< - Config, - detail::default_merge_config - >; + // Get default config if Config is default_config + using config = detail::default_or_custom_config< + Config, + detail::default_merge_config>; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int half_block = block_size / 2; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + constexpr unsigned int block_size = config::block_size; + constexpr unsigned int half_block = block_size / 2; + constexpr unsigned int items_per_thread = config::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; - const unsigned int partitions = ((input1_size + input2_size) + items_per_block - 1) / items_per_block; - const size_t partition_bytes = (partitions + 1) * sizeof(unsigned int); + const unsigned int partitions + = ((input1_size + input2_size) + items_per_block - 1) / items_per_block; + const size_t partition_bytes = (partitions + 1) * sizeof(unsigned int); - if(temporary_storage == nullptr) - { - // storage_size is never zero - storage_size = partition_bytes; - return hipSuccess; - } + if(temporary_storage == nullptr) + { + // storage_size is never zero + storage_size = partition_bytes; + return hipSuccess; + } - // Start point for time measurements - std::chrono::high_resolution_clock::time_point start; + // Start point for time measurements + std::chrono::high_resolution_clock::time_point start; - auto number_of_blocks = partitions; - if(debug_synchronous) - { - std::cout << "block_size " << block_size << '\n'; - std::cout << "number of blocks " << number_of_blocks << '\n'; - std::cout << "items_per_block " << items_per_block << '\n'; - } + auto number_of_blocks = partitions; + if(debug_synchronous) + { + std::cout << "block_size " << block_size << '\n'; + std::cout << "number of blocks " << number_of_blocks << '\n'; + std::cout << "items_per_block " << items_per_block << '\n'; + } - unsigned int * index = reinterpret_cast(temporary_storage); + unsigned int* index = reinterpret_cast(temporary_storage); - const unsigned partition_blocks = ((partitions + 1) + half_block - 1) / half_block; + const unsigned partition_blocks = ((partitions + 1) + half_block - 1) / half_block; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::partition_kernel), - dim3(partition_blocks), dim3(half_block), 0, stream, - index, keys_input1, keys_input2, input1_size, input2_size, - items_per_block, compare_function - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", input1_size, start); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(detail::partition_kernel), + dim3(partition_blocks), + dim3(half_block), + 0, + stream, + index, + keys_input1, + keys_input2, + input1_size, + input2_size, + items_per_block, + compare_function); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", input1_size, start); - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::merge_kernel), - dim3(number_of_blocks), dim3(block_size), 0, stream, - index, keys_input1, keys_input2, keys_output, - values_input1, values_input2, values_output, - input1_size, input2_size, compare_function - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("merge_kernel", input1_size, start); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(detail::merge_kernel), + dim3(number_of_blocks), + dim3(block_size), + 0, + stream, + index, + keys_input1, + keys_input2, + keys_output, + values_input1, + values_input2, + values_output, + input1_size, + input2_size, + compare_function); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("merge_kernel", input1_size, start); - return hipSuccess; -} + return hipSuccess; + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR @@ -271,33 +283,37 @@ hipError_t merge_impl(void * temporary_storage, /// // output: [0, 0, 1, 1, 2, 2, 3, 3] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator1, - class InputIterator2, - class OutputIterator, - class BinaryFunction = ::rocprim::less::value_type> -> -inline -hipError_t merge(void * temporary_storage, - size_t& storage_size, - InputIterator1 input1, - InputIterator2 input2, - OutputIterator output, - const size_t input1_size, - const size_t input2_size, - BinaryFunction compare_function = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t merge(void* temporary_storage, + size_t& storage_size, + InputIterator1 input1, + InputIterator2 input2, + OutputIterator output, + const size_t input1_size, + const size_t input2_size, + BinaryFunction compare_function = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - return detail::merge_impl( - temporary_storage, storage_size, - input1, input2, output, - values, values, values, - input1_size, input2_size, compare_function, - stream, debug_synchronous - ); + empty_type* values = nullptr; + return detail::merge_impl(temporary_storage, + storage_size, + input1, + input2, + output, + values, + values, + values, + input1_size, + input2_size, + compare_function, + stream, + debug_synchronous); } /// \brief Parallel merge primitive for device level. @@ -393,38 +409,42 @@ hipError_t merge(void * temporary_storage, /// // values_output: [10, 20, 11, 21, 12, 22, 13, 23] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator1, - class KeysInputIterator2, - class KeysOutputIterator, - class ValuesInputIterator1, - class ValuesInputIterator2, - class ValuesOutputIterator, - class BinaryFunction = ::rocprim::less::value_type> -> -inline -hipError_t merge(void * temporary_storage, - size_t& storage_size, - KeysInputIterator1 keys_input1, - KeysInputIterator2 keys_input2, - KeysOutputIterator keys_output, - ValuesInputIterator1 values_input1, - ValuesInputIterator2 values_input2, - ValuesOutputIterator values_output, - const size_t input1_size, - const size_t input2_size, - BinaryFunction compare_function = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t merge(void* temporary_storage, + size_t& storage_size, + KeysInputIterator1 keys_input1, + KeysInputIterator2 keys_input2, + KeysOutputIterator keys_output, + ValuesInputIterator1 values_input1, + ValuesInputIterator2 values_input2, + ValuesOutputIterator values_output, + const size_t input1_size, + const size_t input2_size, + BinaryFunction compare_function = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::merge_impl( - temporary_storage, storage_size, - keys_input1, keys_input2, keys_output, - values_input1, values_input2, values_output, - input1_size, input2_size, compare_function, - stream, debug_synchronous - ); + return detail::merge_impl(temporary_storage, + storage_size, + keys_input1, + keys_input2, + keys_output, + values_input1, + values_input2, + values_output, + input1_size, + input2_size, + compare_function, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_merge_config.hpp b/rocprim/include/rocprim/device/device_merge_config.hpp index 37bebb168..efee5e233 100644 --- a/rocprim/include/rocprim/device/device_merge_config.hpp +++ b/rocprim/include/rocprim/device/device_merge_config.hpp @@ -34,68 +34,63 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Configuration of device-level merge primitives. -template +template using merge_config = kernel_config; namespace detail { -template -struct merge_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); - - // TODO Tune when merge-by-key is ready - using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>; -}; - -template -struct merge_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); - - using type = select_type< - select_type_case >, - select_type_case >, - select_type_case >, - merge_config<256, ::rocprim::max(1u, 10u / item_scale)> - >; -}; - -template -struct merge_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); - - // TODO Tune when merge-by-key is ready - using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>; -}; - -template -struct merge_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); - - using type = select_type< - select_type_case >, - select_type_case >, - select_type_case >, - merge_config<256, ::rocprim::max(1u, 10u / item_scale)> - >; -}; - -template -struct default_merge_config - : select_arch< - TargetArch, - select_arch_case<803, merge_config_803>, - select_arch_case<900, merge_config_900>, - merge_config_900 - > { }; + template + struct merge_config_803 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + // TODO Tune when merge-by-key is ready + using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>; + }; + + template + struct merge_config_803 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); + + using type = select_type>, + select_type_case>, + select_type_case>, + merge_config<256, ::rocprim::max(1u, 10u / item_scale)>>; + }; + + template + struct merge_config_900 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + // TODO Tune when merge-by-key is ready + using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>; + }; + + template + struct merge_config_900 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Key), sizeof(int)); + + using type = select_type>, + select_type_case>, + select_type_case>, + merge_config<256, ::rocprim::max(1u, 10u / item_scale)>>; + }; + + template + struct default_merge_config : select_arch>, + select_arch_case<900, merge_config_900>, + merge_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_merge_sort.hpp b/rocprim/include/rocprim/device/device_merge_sort.hpp index f23c584de..e22aef5e2 100644 --- a/rocprim/include/rocprim/device/device_merge_sort.hpp +++ b/rocprim/include/rocprim/device/device_merge_sort.hpp @@ -21,15 +21,15 @@ #ifndef ROCPRIM_DEVICE_DEVICE_SORT_HPP_ #define ROCPRIM_DEVICE_DEVICE_SORT_HPP_ -#include #include +#include #include "../config.hpp" #include "../detail/various.hpp" #include "detail/device_merge_sort.hpp" -#include "device_transform.hpp" #include "device_merge_sort_config.hpp" +#include "device_transform.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -39,198 +39,217 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - unsigned int BlockSize, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class BinaryFunction -> -__global__ -void block_sort_kernel(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const size_t size, - BinaryFunction compare_function) -{ - block_sort_kernel_impl( - keys_input, keys_output, values_input, values_output, - size, compare_function - ); -} + template + __global__ void block_sort_kernel(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const size_t size, + BinaryFunction compare_function) + { + block_sort_kernel_impl( + keys_input, keys_output, values_input, values_output, size, compare_function); + } -template< - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class BinaryFunction -> -__global__ -void block_merge_kernel(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const size_t size, - unsigned int block_size, - BinaryFunction compare_function) -{ - block_merge_kernel_impl( - keys_input, keys_output, values_input, values_output, - size, block_size, compare_function - ); -} + template + __global__ void block_merge_kernel(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const size_t size, + unsigned int block_size, + BinaryFunction compare_function) + { + block_merge_kernel_impl(keys_input, + keys_output, + values_input, + values_output, + size, + block_size, + compare_function); + } -#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ +#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - class Config, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class BinaryFunction -> -inline -hipError_t merge_sort_impl(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const size_t size, - BinaryFunction compare_function, - const hipStream_t stream, - bool debug_synchronous) -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; - constexpr bool with_values = !std::is_same::value; + template + inline hipError_t merge_sort_impl(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const size_t size, + BinaryFunction compare_function, + const hipStream_t stream, + bool debug_synchronous) + { + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + constexpr bool with_values = !std::is_same::value; - // Get default config if Config is default_config - using config = default_or_custom_config< - Config, - default_merge_sort_config - >; + // Get default config if Config is default_config + using config = default_or_custom_config< + Config, + default_merge_sort_config>; - // Block size - constexpr unsigned int block_size = config::block_size; + // Block size + constexpr unsigned int block_size = config::block_size; - const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); - const size_t values_bytes = - with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0; - if(temporary_storage == nullptr) - { - storage_size = keys_bytes; - if(with_values) + const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); + const size_t values_bytes + = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0; + if(temporary_storage == nullptr) { - storage_size += values_bytes; + storage_size = keys_bytes; + if(with_values) + { + storage_size += values_bytes; + } + // Make sure user won't try to allocate 0 bytes memory + storage_size = storage_size == 0 ? 4 : storage_size; + return hipSuccess; } - // Make sure user won't try to allocate 0 bytes memory - storage_size = storage_size == 0 ? 4 : storage_size; - return hipSuccess; - } - auto number_of_blocks = (size + block_size - 1)/block_size; - if(debug_synchronous) - { - std::cout << "block_size " << block_size << '\n'; - std::cout << "number of blocks " << number_of_blocks << '\n'; - } + auto number_of_blocks = (size + block_size - 1) / block_size; + if(debug_synchronous) + { + std::cout << "block_size " << block_size << '\n'; + std::cout << "number of blocks " << number_of_blocks << '\n'; + } - char* ptr = reinterpret_cast(temporary_storage); - key_type * keys_buffer = reinterpret_cast(ptr); - ptr += keys_bytes; - value_type * values_buffer = - with_values ? reinterpret_cast(ptr) : nullptr; + char* ptr = reinterpret_cast(temporary_storage); + key_type* keys_buffer = reinterpret_cast(ptr); + ptr += keys_bytes; + value_type* values_buffer = with_values ? reinterpret_cast(ptr) : nullptr; - // Start point for time measurements - std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); + // Start point for time measurements + std::chrono::high_resolution_clock::time_point start; + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); - const unsigned int grid_size = number_of_blocks; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_sort_kernel), - dim3(grid_size), dim3(block_size), 0, stream, - keys_input, keys_output, values_input, values_output, - size, compare_function - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start); + const unsigned int grid_size = number_of_blocks; + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_sort_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + keys_input, + keys_output, + values_input, + values_output, + size, + compare_function); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start); - bool temporary_store = false; - for(unsigned int block = block_size; block < size; block *= 2) - { - temporary_store = !temporary_store; - if(temporary_store) - { - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_merge_kernel), - dim3(grid_size), dim3(block_size), 0, stream, - keys_output, keys_buffer, values_output, values_buffer, - size, block, compare_function - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start); - } - else + bool temporary_store = false; + for(unsigned int block = block_size; block < size; block *= 2) { - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(block_merge_kernel), - dim3(grid_size), dim3(block_size), 0, stream, - keys_buffer, keys_output, values_buffer, values_output, - size, block, compare_function - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start); + temporary_store = !temporary_store; + if(temporary_store) + { + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_merge_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + keys_output, + keys_buffer, + values_output, + values_buffer, + size, + block, + compare_function); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start); + } + else + { + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_merge_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + keys_buffer, + keys_output, + values_buffer, + values_output, + size, + block, + compare_function); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start); + } } - } - if(temporary_store) - { - hipError_t error = ::rocprim::transform( - keys_buffer, keys_output, size, - ::rocprim::identity(), stream, debug_synchronous - ); - if(error != hipSuccess) return error; - - if(with_values) + if(temporary_store) { - hipError_t error = ::rocprim::transform( - values_buffer, values_output, size, - ::rocprim::identity(), stream, debug_synchronous - ); - if(error != hipSuccess) return error; + hipError_t error = ::rocprim::transform(keys_buffer, + keys_output, + size, + ::rocprim::identity(), + stream, + debug_synchronous); + if(error != hipSuccess) + return error; + + if(with_values) + { + hipError_t error = ::rocprim::transform(values_buffer, + values_output, + size, + ::rocprim::identity(), + stream, + debug_synchronous); + if(error != hipSuccess) + return error; + } } - } - return hipSuccess; -} + return hipSuccess; + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR #undef ROCPRIM_DETAIL_HIP_SYNC @@ -304,28 +323,31 @@ hipError_t merge_sort_impl(void * temporary_storage, /// // keys_output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class BinaryFunction = ::rocprim::less::value_type> -> -inline -hipError_t merge_sort(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - const size_t size, - BinaryFunction compare_function = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t merge_sort(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + const size_t size, + BinaryFunction compare_function = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - return detail::merge_sort_impl( - temporary_storage, storage_size, - keys_input, keys_output, values, values, size, - compare_function, stream, debug_synchronous - ); + empty_type* values = nullptr; + return detail::merge_sort_impl(temporary_storage, + storage_size, + keys_input, + keys_output, + values, + values, + size, + compare_function, + stream, + debug_synchronous); } /// \brief Parallel ascending merge sort-by-key primitive for device level. @@ -406,31 +428,34 @@ hipError_t merge_sort(void * temporary_storage, /// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class BinaryFunction = ::rocprim::less::value_type> -> -inline -hipError_t merge_sort(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const size_t size, - BinaryFunction compare_function = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t merge_sort(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const size_t size, + BinaryFunction compare_function = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::merge_sort_impl( - temporary_storage, storage_size, - keys_input, keys_output, values_input, values_output, size, - compare_function, stream, debug_synchronous - ); + return detail::merge_sort_impl(temporary_storage, + storage_size, + keys_input, + keys_output, + values_input, + values_output, + size, + compare_function, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_merge_sort_config.hpp b/rocprim/include/rocprim/device/device_merge_sort_config.hpp index 5cccea7d2..a7ccd0e14 100644 --- a/rocprim/include/rocprim/device/device_merge_sort_config.hpp +++ b/rocprim/include/rocprim/device/device_merge_sort_config.hpp @@ -36,44 +36,44 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Configuration of device-level merge primitives. /// /// \tparam BlockSize - block size used in merge sort. -template +template using merge_sort_config = kernel_config; namespace detail { -template -struct merge_sort_config_803 -{ - using type = merge_sort_config::value>; -}; - -template -struct merge_sort_config_803 -{ - using type = merge_sort_config::value>; -}; - -template -struct merge_sort_config_900 -{ - using type = merge_sort_config::value>; -}; - -template -struct merge_sort_config_900 -{ - using type = merge_sort_config::value>; -}; - -template -struct default_merge_sort_config - : select_arch< - TargetArch, - select_arch_case<803, merge_sort_config_803>, - select_arch_case<900, merge_sort_config_900>, - merge_sort_config_900 - > { }; + template + struct merge_sort_config_803 + { + using type = merge_sort_config::value>; + }; + + template + struct merge_sort_config_803 + { + using type = merge_sort_config::value>; + }; + + template + struct merge_sort_config_900 + { + using type = merge_sort_config::value>; + }; + + template + struct merge_sort_config_900 + { + using type = merge_sort_config::value>; + }; + + template + struct default_merge_sort_config + : select_arch>, + select_arch_case<900, merge_sort_config_900>, + merge_sort_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_partition.hpp b/rocprim/include/rocprim/device/device_partition.hpp index db626901e..7f9cb81d3 100644 --- a/rocprim/include/rocprim/device/device_partition.hpp +++ b/rocprim/include/rocprim/device/device_partition.hpp @@ -21,16 +21,16 @@ #ifndef ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_ #define ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_ -#include #include +#include #include "../config.hpp" +#include "../detail/various.hpp" #include "../functional.hpp" #include "../type_traits.hpp" -#include "../detail/various.hpp" -#include "device_select_config.hpp" #include "detail/device_partition.hpp" +#include "device_select_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -40,174 +40,190 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - select_method SelectMethod, - bool OnlySelected, - class Config, - class InputIterator, - class FlagIterator, - class OutputIterator, - class SelectedCountOutputIterator, - class UnaryPredicate, - class InequalityOp, - class OffsetLookbackScanState -> -__global__ -void partition_kernel(InputIterator input, - FlagIterator flags, - OutputIterator output, - SelectedCountOutputIterator selected_count_output, - const size_t size, - UnaryPredicate predicate, - InequalityOp inequality_op, - OffsetLookbackScanState offset_scan_state, - const unsigned int number_of_blocks, - ordered_block_id ordered_bid) -{ - partition_kernel_impl( - input, flags, output, selected_count_output, size, predicate, - inequality_op, offset_scan_state, number_of_blocks, ordered_bid - ); -} + template + __global__ void partition_kernel(InputIterator input, + FlagIterator flags, + OutputIterator output, + SelectedCountOutputIterator selected_count_output, + const size_t size, + UnaryPredicate predicate, + InequalityOp inequality_op, + OffsetLookbackScanState offset_scan_state, + const unsigned int number_of_blocks, + ordered_block_id ordered_bid) + { + partition_kernel_impl(input, + flags, + output, + selected_count_output, + size, + predicate, + inequality_op, + offset_scan_state, + number_of_blocks, + ordered_bid); + } -template -__global__ -void init_offset_scan_state_kernel(OffsetLookBackScanState offset_scan_state, - const unsigned int number_of_blocks, - ordered_block_id ordered_bid) -{ - init_lookback_scan_state_kernel_impl( - offset_scan_state, number_of_blocks, ordered_bid - ); -} + template + __global__ void init_offset_scan_state_kernel(OffsetLookBackScanState offset_scan_state, + const unsigned int number_of_blocks, + ordered_block_id ordered_bid) + { + init_lookback_scan_state_kernel_impl(offset_scan_state, number_of_blocks, ordered_bid); + } -#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ +#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - // Method of selection: flag, predicate, unique - select_method SelectMethod, - // if true, it doesn't copy rejected values to output - bool OnlySelected, - class Config, - class InputIterator, - class FlagIterator, - class OutputIterator, - class UnaryPredicate, - class InequalityOp, - class SelectedCountOutputIterator -> -inline -hipError_t partition_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - FlagIterator flags, - OutputIterator output, - SelectedCountOutputIterator selected_count_output, - const size_t size, - UnaryPredicate predicate, - InequalityOp inequality_op, - const hipStream_t stream, - bool debug_synchronous) -{ - using offset_type = unsigned int; - using input_type = typename std::iterator_traits::value_type; + template < + // Method of selection: flag, predicate, unique + select_method SelectMethod, + // if true, it doesn't copy rejected values to output + bool OnlySelected, + class Config, + class InputIterator, + class FlagIterator, + class OutputIterator, + class UnaryPredicate, + class InequalityOp, + class SelectedCountOutputIterator> + inline hipError_t partition_impl(void* temporary_storage, + size_t& storage_size, + InputIterator input, + FlagIterator flags, + OutputIterator output, + SelectedCountOutputIterator selected_count_output, + const size_t size, + UnaryPredicate predicate, + InequalityOp inequality_op, + const hipStream_t stream, + bool debug_synchronous) + { + using offset_type = unsigned int; + using input_type = typename std::iterator_traits::value_type; - // Get default config if Config is default_config - using config = default_or_custom_config< - Config, - default_select_config - >; + // Get default config if Config is default_config + using config + = default_or_custom_config>; - using offset_scan_state_type = detail::lookback_scan_state; - using ordered_block_id_type = detail::ordered_block_id; + using offset_scan_state_type = detail::lookback_scan_state; + using ordered_block_id_type = detail::ordered_block_id; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; - const unsigned int number_of_blocks = - std::max(1u, static_cast((size + items_per_block - 1)/items_per_block)); + constexpr unsigned int block_size = config::block_size; + constexpr unsigned int items_per_thread = config::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; + const unsigned int number_of_blocks = std::max( + 1u, static_cast((size + items_per_block - 1) / items_per_block)); - // Calculate required temporary storage - size_t offset_scan_state_bytes = ::rocprim::detail::align_size( - offset_scan_state_type::get_storage_size(number_of_blocks) - ); - size_t ordered_block_id_bytes = ordered_block_id_type::get_storage_size(); - if(temporary_storage == nullptr) - { - // storage_size is never zero - storage_size = offset_scan_state_bytes + ordered_block_id_bytes; - return hipSuccess; - } + // Calculate required temporary storage + size_t offset_scan_state_bytes = ::rocprim::detail::align_size( + offset_scan_state_type::get_storage_size(number_of_blocks)); + size_t ordered_block_id_bytes = ordered_block_id_type::get_storage_size(); + if(temporary_storage == nullptr) + { + // storage_size is never zero + storage_size = offset_scan_state_bytes + ordered_block_id_bytes; + return hipSuccess; + } - // Start point for time measurements - std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) - { - std::cout << "size " << size << '\n'; - std::cout << "block_size " << block_size << '\n'; - std::cout << "number of blocks " << number_of_blocks << '\n'; - std::cout << "items_per_block " << items_per_block << '\n'; - } + // Start point for time measurements + std::chrono::high_resolution_clock::time_point start; + if(debug_synchronous) + { + std::cout << "size " << size << '\n'; + std::cout << "block_size " << block_size << '\n'; + std::cout << "number of blocks " << number_of_blocks << '\n'; + std::cout << "items_per_block " << items_per_block << '\n'; + } - // Create and initialize lookback_scan_state obj - auto offset_scan_state = offset_scan_state_type::create( - temporary_storage, number_of_blocks - ); - // Create ad initialize ordered_block_id obj - auto ptr = reinterpret_cast(temporary_storage); - auto ordered_bid = ordered_block_id_type::create( - reinterpret_cast(ptr + offset_scan_state_bytes) - ); + // Create and initialize lookback_scan_state obj + auto offset_scan_state + = offset_scan_state_type::create(temporary_storage, number_of_blocks); + // Create ad initialize ordered_block_id obj + auto ptr = reinterpret_cast(temporary_storage); + auto ordered_bid = ordered_block_id_type::create( + reinterpret_cast(ptr + offset_scan_state_bytes)); - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - auto grid_size = (number_of_blocks + block_size - 1)/block_size; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(init_offset_scan_state_kernel), - dim3(grid_size), dim3(block_size), 0, stream, - offset_scan_state, number_of_blocks, ordered_bid - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_offset_scan_state_kernel", size, start) + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + auto grid_size = (number_of_blocks + block_size - 1) / block_size; + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_offset_scan_state_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + offset_scan_state, + number_of_blocks, + ordered_bid); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_offset_scan_state_kernel", size, start) - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - grid_size = number_of_blocks; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(partition_kernel< - SelectMethod, OnlySelected, config, - InputIterator, FlagIterator, OutputIterator, SelectedCountOutputIterator, - UnaryPredicate, decltype(inequality_op), offset_scan_state_type - >), - dim3(grid_size), dim3(block_size), 0, stream, - input, flags, output, selected_count_output, size, predicate, - inequality_op, offset_scan_state, number_of_blocks, ordered_bid - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", size, start) + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + grid_size = number_of_blocks; + hipLaunchKernelGGL(HIP_KERNEL_NAME(partition_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + input, + flags, + output, + selected_count_output, + size, + predicate, + inequality_op, + offset_scan_state, + number_of_blocks, + ordered_bid); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", size, start) - return hipSuccess; -} + return hipSuccess; + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR #undef ROCPRIM_DETAIL_HIP_SYNC @@ -293,23 +309,20 @@ hipError_t partition_impl(void * temporary_storage, /// // output_count: 4 /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class FlagIterator, - class OutputIterator, - class SelectedCountOutputIterator -> -inline -hipError_t partition(void * temporary_storage, - size_t& storage_size, - InputIterator input, - FlagIterator flags, - OutputIterator output, - SelectedCountOutputIterator selected_count_output, - const size_t size, - const hipStream_t stream = 0, - const bool debug_synchronous = false) +template +inline hipError_t partition(void* temporary_storage, + size_t& storage_size, + InputIterator input, + FlagIterator flags, + OutputIterator output, + SelectedCountOutputIterator selected_count_output, + const size_t size, + const hipStream_t stream = 0, + const bool debug_synchronous = false) { // Dummy unary predicate using unary_predicate_type = ::rocprim::empty_type; @@ -317,9 +330,17 @@ hipError_t partition(void * temporary_storage, using inequality_op_type = ::rocprim::empty_type; return detail::partition_impl( - temporary_storage, storage_size, input, flags, output, selected_count_output, - size, unary_predicate_type(), inequality_op_type(), stream, debug_synchronous - ); + temporary_storage, + storage_size, + input, + flags, + output, + selected_count_output, + size, + unary_predicate_type(), + inequality_op_type(), + stream, + debug_synchronous); } /// \brief Parallel select primitive for device level using selection predicate. @@ -409,34 +430,39 @@ hipError_t partition(void * temporary_storage, /// // output_count: 4 /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class SelectedCountOutputIterator, - class UnaryPredicate -> -inline -hipError_t partition(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - SelectedCountOutputIterator selected_count_output, - const size_t size, - UnaryPredicate predicate, - const hipStream_t stream = 0, - const bool debug_synchronous = false) +template +inline hipError_t partition(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + SelectedCountOutputIterator selected_count_output, + const size_t size, + UnaryPredicate predicate, + const hipStream_t stream = 0, + const bool debug_synchronous = false) { // Dummy flag type - using flag_type = ::rocprim::empty_type; - flag_type * flags = nullptr; + using flag_type = ::rocprim::empty_type; + flag_type* flags = nullptr; // Dummy inequality operation using inequality_op_type = ::rocprim::empty_type; return detail::partition_impl( - temporary_storage, storage_size, input, flags, output, selected_count_output, - size, predicate, inequality_op_type(), stream, debug_synchronous - ); + temporary_storage, + storage_size, + input, + flags, + output, + selected_count_output, + size, + predicate, + inequality_op_type(), + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_radix_sort.hpp b/rocprim/include/rocprim/device/device_radix_sort.hpp index 83561263f..000c4f494 100644 --- a/rocprim/include/rocprim/device/device_radix_sort.hpp +++ b/rocprim/include/rocprim/device/device_radix_sort.hpp @@ -27,15 +27,15 @@ #include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/radix_sort.hpp" +#include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" -#include "device_radix_sort_config.hpp" #include "detail/device_radix_sort.hpp" +#include "device_radix_sort_config.hpp" /// \addtogroup devicemodule /// @{ @@ -45,439 +45,534 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits, - bool Descending, - class KeysInputIterator -> -__global__ -void fill_digit_counts_kernel(KeysInputIterator keys_input, - unsigned int size, - unsigned int * batch_digit_counts, - unsigned int bit, - unsigned int current_radix_bits, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - fill_digit_counts( - keys_input, size, - batch_digit_counts, - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); -} + template + __global__ void fill_digit_counts_kernel(KeysInputIterator keys_input, + unsigned int size, + unsigned int* batch_digit_counts, + unsigned int bit, + unsigned int current_radix_bits, + unsigned int blocks_per_full_batch, + unsigned int full_batches) + { + fill_digit_counts(keys_input, + size, + batch_digit_counts, + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits -> -__global__ -void scan_batches_kernel(unsigned int * batch_digit_counts, - unsigned int * digit_counts, - unsigned int batches) -{ - scan_batches(batch_digit_counts, digit_counts, batches); -} + template + __global__ void scan_batches_kernel(unsigned int* batch_digit_counts, + unsigned int* digit_counts, + unsigned int batches) + { + scan_batches( + batch_digit_counts, digit_counts, batches); + } -template -__global__ -void scan_digits_kernel(unsigned int * digit_counts) -{ - scan_digits(digit_counts); -} + template + __global__ void scan_digits_kernel(unsigned int* digit_counts) + { + scan_digits(digit_counts); + } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int RadixBits, - bool Descending, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator -> -__global__ -void sort_and_scatter_kernel(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int size, - const unsigned int * batch_digit_starts, - const unsigned int * digit_starts, - unsigned int bit, - unsigned int current_radix_bits, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - sort_and_scatter( - keys_input, keys_output, values_input, values_output, size, - batch_digit_starts, digit_starts, - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); -} + template + __global__ void sort_and_scatter_kernel(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int size, + const unsigned int* batch_digit_starts, + const unsigned int* digit_starts, + unsigned int bit, + unsigned int current_radix_bits, + unsigned int blocks_per_full_batch, + unsigned int full_batches) + { + sort_and_scatter(keys_input, + keys_output, + values_input, + values_output, + size, + batch_digit_starts, + digit_starts, + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - class Config, - unsigned int RadixBits, - bool Descending, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator -> -inline -hipError_t radix_sort_iteration(KeysInputIterator keys_input, - typename std::iterator_traits::value_type * keys_tmp, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - typename std::iterator_traits::value_type * values_tmp, - ValuesOutputIterator values_output, - unsigned int size, - unsigned int * batch_digit_counts, - unsigned int * digit_counts, - bool from_input, - bool to_output, - unsigned int bit, - unsigned int end_bit, - unsigned int blocks_per_full_batch, - unsigned int full_batches, - unsigned int batches, - hipStream_t stream, - bool debug_synchronous) -{ - constexpr unsigned int radix_size = 1 << RadixBits; + template + inline hipError_t radix_sort_iteration( + KeysInputIterator keys_input, + typename std::iterator_traits::value_type* keys_tmp, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + typename std::iterator_traits::value_type* values_tmp, + ValuesOutputIterator values_output, + unsigned int size, + unsigned int* batch_digit_counts, + unsigned int* digit_counts, + bool from_input, + bool to_output, + unsigned int bit, + unsigned int end_bit, + unsigned int blocks_per_full_batch, + unsigned int full_batches, + unsigned int batches, + hipStream_t stream, + bool debug_synchronous) + { + constexpr unsigned int radix_size = 1 << RadixBits; - // Handle cases when (end_bit - bit) is not divisible by RadixBits, i.e. the last - // iteration has a shorter mask. - const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit); + // Handle cases when (end_bit - bit) is not divisible by RadixBits, i.e. the last + // iteration has a shorter mask. + const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit); - std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) - { - std::cout << "RadixBits " << RadixBits << '\n'; - std::cout << "bit " << bit << '\n'; - std::cout << "current_radix_bits " << current_radix_bits << '\n'; - } + if(debug_synchronous) + { + std::cout << "RadixBits " << RadixBits << '\n'; + std::cout << "bit " << bit << '\n'; + std::cout << "current_radix_bits " << current_radix_bits << '\n'; + } - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - if(from_input) - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(fill_digit_counts_kernel< - Config::sort::block_size, Config::sort::items_per_thread, RadixBits, Descending - >), - dim3(batches), dim3(Config::sort::block_size), 0, stream, - keys_input, size, - batch_digit_counts, - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); - } - else - { - if(to_output) + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + if(from_input) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(fill_digit_counts_kernel< - Config::sort::block_size, Config::sort::items_per_thread, RadixBits, Descending - >), - dim3(batches), dim3(Config::sort::block_size), 0, stream, - keys_tmp, size, + HIP_KERNEL_NAME(fill_digit_counts_kernel), + dim3(batches), + dim3(Config::sort::block_size), + 0, + stream, + keys_input, + size, batch_digit_counts, - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(fill_digit_counts_kernel< - Config::sort::block_size, Config::sort::items_per_thread, RadixBits, Descending - >), - dim3(batches), dim3(Config::sort::block_size), 0, stream, - keys_output, size, - batch_digit_counts, - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); + if(to_output) + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(fill_digit_counts_kernel), + dim3(batches), + dim3(Config::sort::block_size), + 0, + stream, + keys_tmp, + size, + batch_digit_counts, + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } + else + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(fill_digit_counts_kernel), + dim3(batches), + dim3(Config::sort::block_size), + 0, + stream, + keys_output, + size, + batch_digit_counts, + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } } - } - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_digit_counts", size, start) + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_digit_counts", size, start) - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(scan_batches_kernel), - dim3(radix_size), dim3(Config::scan::block_size), 0, stream, - batch_digit_counts, digit_counts, batches - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_batches", radix_size * Config::scan::block_size, start) + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(scan_batches_kernel), + dim3(radix_size), + dim3(Config::scan::block_size), + 0, + stream, + batch_digit_counts, + digit_counts, + batches); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR( + "scan_batches", radix_size * Config::scan::block_size, start) - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(scan_digits_kernel), - dim3(1), dim3(radix_size), 0, stream, - digit_counts - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_digits", radix_size, start) + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(scan_digits_kernel), + dim3(1), + dim3(radix_size), + 0, + stream, + digit_counts); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_digits", radix_size, start) - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - if(from_input) - { - if(to_output) + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + if(from_input) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_and_scatter_kernel< - Config::sort::block_size, Config::sort::items_per_thread, RadixBits, Descending - >), - dim3(batches), dim3(Config::sort::block_size), 0, stream, - keys_input, keys_output, values_input, values_output, size, - const_cast(batch_digit_counts), - const_cast(digit_counts), - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); + if(to_output) + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_and_scatter_kernel), + dim3(batches), + dim3(Config::sort::block_size), + 0, + stream, + keys_input, + keys_output, + values_input, + values_output, + size, + const_cast(batch_digit_counts), + const_cast(digit_counts), + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } + else + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_and_scatter_kernel), + dim3(batches), + dim3(Config::sort::block_size), + 0, + stream, + keys_input, + keys_tmp, + values_input, + values_tmp, + size, + const_cast(batch_digit_counts), + const_cast(digit_counts), + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_and_scatter_kernel< - Config::sort::block_size, Config::sort::items_per_thread, RadixBits, Descending - >), - dim3(batches), dim3(Config::sort::block_size), 0, stream, - keys_input, keys_tmp, values_input, values_tmp, size, - const_cast(batch_digit_counts), - const_cast(digit_counts), - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); + if(to_output) + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_and_scatter_kernel), + dim3(batches), + dim3(Config::sort::block_size), + 0, + stream, + keys_tmp, + keys_output, + values_tmp, + values_output, + size, + const_cast(batch_digit_counts), + const_cast(digit_counts), + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } + else + { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sort_and_scatter_kernel), + dim3(batches), + dim3(Config::sort::block_size), + 0, + stream, + keys_output, + keys_tmp, + values_output, + values_tmp, + size, + const_cast(batch_digit_counts), + const_cast(digit_counts), + bit, + current_radix_bits, + blocks_per_full_batch, + full_batches); + } } + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("sort_and_scatter", size, start) + + return hipSuccess; } - else + + template + inline hipError_t + radix_sort_impl(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + typename std::iterator_traits::value_type* keys_tmp, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + typename std::iterator_traits::value_type* values_tmp, + ValuesOutputIterator values_output, + unsigned int size, + bool& is_result_in_output, + unsigned int begin_bit, + unsigned int end_bit, + hipStream_t stream, + bool debug_synchronous) { - if(to_output) - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_and_scatter_kernel< - Config::sort::block_size, Config::sort::items_per_thread, RadixBits, Descending - >), - dim3(batches), dim3(Config::sort::block_size), 0, stream, - keys_tmp, keys_output, values_tmp, values_output, size, - const_cast(batch_digit_counts), - const_cast(digit_counts), - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); - } - else - { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_and_scatter_kernel< - Config::sort::block_size, Config::sort::items_per_thread, RadixBits, Descending - >), - dim3(batches), dim3(Config::sort::block_size), 0, stream, - keys_output, keys_tmp, values_output, values_tmp, size, - const_cast(batch_digit_counts), - const_cast(digit_counts), - bit, current_radix_bits, - blocks_per_full_batch, full_batches - ); - } - } - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("sort_and_scatter", size, start) + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; - return hipSuccess; -} + static_assert( + std::is_same::value_type>::value, + "KeysInputIterator and KeysOutputIterator must have the same value_type"); + static_assert( + std::is_same::value_type>::value, + "ValuesInputIterator and ValuesOutputIterator must have the same value_type"); -template< - class Config, - bool Descending, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator -> -inline -hipError_t radix_sort_impl(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - typename std::iterator_traits::value_type * keys_tmp, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - typename std::iterator_traits::value_type * values_tmp, - ValuesOutputIterator values_output, - unsigned int size, - bool& is_result_in_output, - unsigned int begin_bit, - unsigned int end_bit, - hipStream_t stream, - bool debug_synchronous) -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; + using config = default_or_custom_config< + Config, + default_radix_sort_config>; - static_assert( - std::is_same::value_type>::value, - "KeysInputIterator and KeysOutputIterator must have the same value_type" - ); - static_assert( - std::is_same::value_type>::value, - "ValuesInputIterator and ValuesOutputIterator must have the same value_type" - ); + constexpr bool with_values = !std::is_same::value; - using config = default_or_custom_config< - Config, - default_radix_sort_config - >; + constexpr unsigned int max_radix_size = 1 << config::long_radix_bits; - constexpr bool with_values = !std::is_same::value; + constexpr unsigned int scan_size + = config::scan::block_size * config::scan::items_per_thread; + constexpr unsigned int sort_size + = config::sort::block_size * config::sort::items_per_thread; - constexpr unsigned int max_radix_size = 1 << config::long_radix_bits; + const unsigned int blocks = std::max(1u, ::rocprim::detail::ceiling_div(size, sort_size)); + const unsigned int blocks_per_full_batch + = ::rocprim::detail::ceiling_div(blocks, scan_size); + const unsigned int full_batches = blocks % scan_size != 0 ? blocks % scan_size : scan_size; + const unsigned int batches = (blocks_per_full_batch == 1 ? full_batches : scan_size); + const bool with_double_buffer = keys_tmp != nullptr; - constexpr unsigned int scan_size = config::scan::block_size * config::scan::items_per_thread; - constexpr unsigned int sort_size = config::sort::block_size * config::sort::items_per_thread; + const unsigned int bits = end_bit - begin_bit; + const unsigned int iterations + = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits); + const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits; + const unsigned int short_iterations + = radix_bits_diff != 0 ? ::rocprim::min( + iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff) + : 0; + const unsigned int long_iterations = iterations - short_iterations; - const unsigned int blocks = std::max(1u, ::rocprim::detail::ceiling_div(size, sort_size)); - const unsigned int blocks_per_full_batch = ::rocprim::detail::ceiling_div(blocks, scan_size); - const unsigned int full_batches = blocks % scan_size != 0 - ? blocks % scan_size - : scan_size; - const unsigned int batches = (blocks_per_full_batch == 1 ? full_batches : scan_size); - const bool with_double_buffer = keys_tmp != nullptr; + const size_t batch_digit_counts_bytes + = ::rocprim::detail::align_size(batches * max_radix_size * sizeof(unsigned int)); + const size_t digit_counts_bytes + = ::rocprim::detail::align_size(max_radix_size * sizeof(unsigned int)); + const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); + const size_t values_bytes + = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0; + if(temporary_storage == nullptr) + { + storage_size = batch_digit_counts_bytes + digit_counts_bytes; + if(!with_double_buffer) + { + storage_size += keys_bytes + values_bytes; + } + return hipSuccess; + } - const unsigned int bits = end_bit - begin_bit; - const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits); - const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits; - const unsigned int short_iterations = radix_bits_diff != 0 - ? ::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff) - : 0; - const unsigned int long_iterations = iterations - short_iterations; + if(debug_synchronous) + { + std::cout << "blocks " << blocks << '\n'; + std::cout << "blocks_per_full_batch " << blocks_per_full_batch << '\n'; + std::cout << "full_batches " << full_batches << '\n'; + std::cout << "batches " << batches << '\n'; + std::cout << "iterations " << iterations << '\n'; + std::cout << "long_iterations " << long_iterations << '\n'; + std::cout << "short_iterations " << short_iterations << '\n'; + hipError_t error = hipStreamSynchronize(stream); + if(error != hipSuccess) + return error; + } - const size_t batch_digit_counts_bytes = - ::rocprim::detail::align_size(batches * max_radix_size * sizeof(unsigned int)); - const size_t digit_counts_bytes = ::rocprim::detail::align_size(max_radix_size * sizeof(unsigned int)); - const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); - const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0; - if(temporary_storage == nullptr) - { - storage_size = batch_digit_counts_bytes + digit_counts_bytes; + char* ptr = reinterpret_cast(temporary_storage); + unsigned int* batch_digit_counts = reinterpret_cast(ptr); + ptr += batch_digit_counts_bytes; + unsigned int* digit_counts = reinterpret_cast(ptr); + ptr += digit_counts_bytes; if(!with_double_buffer) { - storage_size += keys_bytes + values_bytes; + keys_tmp = reinterpret_cast(ptr); + ptr += keys_bytes; + values_tmp = with_values ? reinterpret_cast(ptr) : nullptr; } - return hipSuccess; - } - - if(debug_synchronous) - { - std::cout << "blocks " << blocks << '\n'; - std::cout << "blocks_per_full_batch " << blocks_per_full_batch << '\n'; - std::cout << "full_batches " << full_batches << '\n'; - std::cout << "batches " << batches << '\n'; - std::cout << "iterations " << iterations << '\n'; - std::cout << "long_iterations " << long_iterations << '\n'; - std::cout << "short_iterations " << short_iterations << '\n'; - hipError_t error = hipStreamSynchronize(stream); - if(error != hipSuccess) return error; - } - char * ptr = reinterpret_cast(temporary_storage); - unsigned int * batch_digit_counts = reinterpret_cast(ptr); - ptr += batch_digit_counts_bytes; - unsigned int * digit_counts = reinterpret_cast(ptr); - ptr += digit_counts_bytes; - if(!with_double_buffer) - { - keys_tmp = reinterpret_cast(ptr); - ptr += keys_bytes; - values_tmp = with_values ? reinterpret_cast(ptr) : nullptr; - } - - bool to_output = with_double_buffer || (iterations - 1) % 2 == 0; - bool from_input = true; - if(!with_double_buffer && to_output) - { - // Copy input keys and values if necessary (in-place sorting: input and output iterators are equal) - const bool keys_equal = ::rocprim::detail::are_iterators_equal(keys_input, keys_output); - const bool values_equal = with_values && ::rocprim::detail::are_iterators_equal(values_input, values_output); - if(keys_equal || values_equal) + bool to_output = with_double_buffer || (iterations - 1) % 2 == 0; + bool from_input = true; + if(!with_double_buffer && to_output) { - hipError_t error = ::rocprim::transform( - keys_input, keys_tmp, size, - ::rocprim::identity(), stream, debug_synchronous - ); - if(error != hipSuccess) return error; - - if(with_values) + // Copy input keys and values if necessary (in-place sorting: input and output iterators are equal) + const bool keys_equal = ::rocprim::detail::are_iterators_equal(keys_input, keys_output); + const bool values_equal + = with_values + && ::rocprim::detail::are_iterators_equal(values_input, values_output); + if(keys_equal || values_equal) { - hipError_t error = ::rocprim::transform( - values_input, values_tmp, size, - ::rocprim::identity(), stream, debug_synchronous - ); - if(error != hipSuccess) return error; - } + hipError_t error = ::rocprim::transform(keys_input, + keys_tmp, + size, + ::rocprim::identity(), + stream, + debug_synchronous); + if(error != hipSuccess) + return error; - from_input = false; + if(with_values) + { + hipError_t error = ::rocprim::transform(values_input, + values_tmp, + size, + ::rocprim::identity(), + stream, + debug_synchronous); + if(error != hipSuccess) + return error; + } + + from_input = false; + } } - } - unsigned int bit = begin_bit; - for(unsigned int i = 0; i < long_iterations; i++) - { - hipError_t error = radix_sort_iteration( - keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, size, - batch_digit_counts, digit_counts, - from_input, to_output, - bit, end_bit, - blocks_per_full_batch, full_batches, batches, - stream, debug_synchronous - ); - if(error != hipSuccess) return error; + unsigned int bit = begin_bit; + for(unsigned int i = 0; i < long_iterations; i++) + { + hipError_t error = radix_sort_iteration( + keys_input, + keys_tmp, + keys_output, + values_input, + values_tmp, + values_output, + size, + batch_digit_counts, + digit_counts, + from_input, + to_output, + bit, + end_bit, + blocks_per_full_batch, + full_batches, + batches, + stream, + debug_synchronous); + if(error != hipSuccess) + return error; - is_result_in_output = to_output; - from_input = false; - to_output = !to_output; - bit += config::long_radix_bits; - } - for(unsigned int i = 0; i < short_iterations; i++) - { - hipError_t error = radix_sort_iteration( - keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, size, - batch_digit_counts, digit_counts, - from_input, to_output, - bit, end_bit, - blocks_per_full_batch, full_batches, batches, - stream, debug_synchronous - ); - if(error != hipSuccess) return error; + is_result_in_output = to_output; + from_input = false; + to_output = !to_output; + bit += config::long_radix_bits; + } + for(unsigned int i = 0; i < short_iterations; i++) + { + hipError_t error = radix_sort_iteration( + keys_input, + keys_tmp, + keys_output, + values_input, + values_tmp, + values_output, + size, + batch_digit_counts, + digit_counts, + from_input, + to_output, + bit, + end_bit, + blocks_per_full_batch, + full_batches, + batches, + stream, + debug_synchronous); + if(error != hipSuccess) + return error; - is_result_in_output = to_output; - from_input = false; - to_output = !to_output; - bit += config::short_radix_bits; - } + is_result_in_output = to_output; + from_input = false; + to_output = !to_output; + bit += config::short_radix_bits; + } - return hipSuccess; -} + return hipSuccess; + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR @@ -557,33 +652,36 @@ hipError_t radix_sort_impl(void * temporary_storage, /// // keys_output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t radix_sort_keys(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t radix_sort_keys(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool ignored; - return detail::radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values, nullptr, values, - size, ignored, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool ignored; + return detail::radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values, + nullptr, + values, + size, + ignored, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel descending radix sort primitive for device level. @@ -660,33 +758,36 @@ hipError_t radix_sort_keys(void * temporary_storage, /// // keys_output: [8, 7, 6, 5, 4, 3, 2, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t radix_sort_keys_desc(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t radix_sort_keys_desc(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool ignored; - return detail::radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values, nullptr, values, - size, ignored, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool ignored; + return detail::radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values, + nullptr, + values, + size, + ignored, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel ascending radix sort-by-key primitive for device level. @@ -779,36 +880,39 @@ hipError_t radix_sort_keys_desc(void * temporary_storage, /// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t radix_sort_pairs(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t radix_sort_pairs(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { bool ignored; - return detail::radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values_input, nullptr, values_output, - size, ignored, - begin_bit, end_bit, - stream, debug_synchronous - ); + return detail::radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values_input, + nullptr, + values_output, + size, + ignored, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel descending radix sort-by-key primitive for device level. @@ -897,36 +1001,39 @@ hipError_t radix_sort_pairs(void * temporary_storage, /// // values_output: [-8, 7, -5, -4, 3, 2, -1, -2] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t radix_sort_pairs_desc(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t radix_sort_pairs_desc(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { bool ignored; - return detail::radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values_input, nullptr, values_output, - size, ignored, - begin_bit, end_bit, - stream, debug_synchronous - ); + return detail::radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values_input, + nullptr, + values_output, + size, + ignored, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel ascending radix sort primitive for device level. @@ -1007,30 +1114,32 @@ hipError_t radix_sort_pairs_desc(void * temporary_storage, /// // keys.current(): [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key -> -inline -hipError_t radix_sort_keys(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t radix_sort_keys(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool is_result_in_output; - hipError_t error = detail::radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values, values, values, - size, is_result_in_output, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool is_result_in_output; + hipError_t error = detail::radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values, + values, + values, + size, + is_result_in_output, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); @@ -1116,30 +1225,32 @@ hipError_t radix_sort_keys(void * temporary_storage, /// // keys.current(): [8, 7, 6, 5, 4, 3, 2, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key -> -inline -hipError_t radix_sort_keys_desc(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t radix_sort_keys_desc(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool is_result_in_output; - hipError_t error = detail::radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values, values, values, - size, is_result_in_output, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool is_result_in_output; + hipError_t error = detail::radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values, + values, + values, + size, + is_result_in_output, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); @@ -1238,31 +1349,32 @@ hipError_t radix_sort_keys_desc(void * temporary_storage, /// // values.current(): [-1, -2, 2, 3, -4, -5, 7, -8] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key, - class Value -> -inline -hipError_t radix_sort_pairs(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - double_buffer& values, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t radix_sort_pairs(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + double_buffer& values, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - bool is_result_in_output; - hipError_t error = detail::radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values.current(), values.current(), values.alternate(), - size, is_result_in_output, - begin_bit, end_bit, - stream, debug_synchronous - ); + bool is_result_in_output; + hipError_t error = detail::radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values.current(), + values.current(), + values.alternate(), + size, + is_result_in_output, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); @@ -1356,31 +1468,32 @@ hipError_t radix_sort_pairs(void * temporary_storage, /// // values.current(): [-8, 7, -5, -4, 3, 2, -1, -2] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key, - class Value -> -inline -hipError_t radix_sort_pairs_desc(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - double_buffer& values, - unsigned int size, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t radix_sort_pairs_desc(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + double_buffer& values, + unsigned int size, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - bool is_result_in_output; - hipError_t error = detail::radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values.current(), values.current(), values.alternate(), - size, is_result_in_output, - begin_bit, end_bit, - stream, debug_synchronous - ); + bool is_result_in_output; + hipError_t error = detail::radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values.current(), + values.current(), + values.alternate(), + size, + is_result_in_output, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); diff --git a/rocprim/include/rocprim/device/device_radix_sort_config.hpp b/rocprim/include/rocprim/device/device_radix_sort_config.hpp index f2134d8aa..95c9eba67 100644 --- a/rocprim/include/rocprim/device/device_radix_sort_config.hpp +++ b/rocprim/include/rocprim/device/device_radix_sort_config.hpp @@ -46,12 +46,10 @@ BEGIN_ROCPRIM_NAMESPACE /// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits. /// \tparam ScanConfig - configuration of digits scan kernel. Must be \p kernel_config. /// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config. -template< - unsigned int LongRadixBits, - unsigned int ShortRadixBits, - class ScanConfig, - class SortConfig -> +template struct radix_sort_config { /// \brief Number of bits in long iterations. @@ -67,102 +65,96 @@ struct radix_sort_config namespace detail { -template -struct radix_sort_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); - - using scan = kernel_config<256, 2>; - - using type = select_type< - select_type_case< - (sizeof(Key) == 1 && sizeof(Value) <= 8), - radix_sort_config<8, 7, scan, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 2 && sizeof(Value) <= 8), - radix_sort_config<8, 7, scan, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 4 && sizeof(Value) <= 8), - radix_sort_config<7, 6, scan, kernel_config<256, 15> > - >, - select_type_case< - (sizeof(Key) == 8 && sizeof(Value) <= 8), - radix_sort_config<7, 6, scan, kernel_config<256, 13> > - >, - radix_sort_config< - 6, 4, scan, - kernel_config< - limit_block_size<256U, sizeof(Value)>::value, - ::rocprim::max(1u, 15u / item_scale) - > - > - >; -}; - -template -struct radix_sort_config_803 - : select_type< - select_type_case, kernel_config<256, 10> > >, - select_type_case, kernel_config<256, 10> > >, - select_type_case, kernel_config<256, 9> > >, - select_type_case, kernel_config<256, 7> > > - > { }; - -template -struct radix_sort_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); - - using scan = kernel_config<256, 2>; - - using type = select_type< - select_type_case< - (sizeof(Key) == 1 && sizeof(Value) <= 8), - radix_sort_config<4, 4, scan, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 2 && sizeof(Value) <= 8), - radix_sort_config<6, 5, scan, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 4 && sizeof(Value) <= 8), - radix_sort_config<7, 6, scan, kernel_config<256, 15> > - >, - select_type_case< - (sizeof(Key) == 8 && sizeof(Value) <= 8), - radix_sort_config<7, 6, scan, kernel_config<256, 15> > - >, - radix_sort_config< - 6, 4, scan, - kernel_config< - limit_block_size<256U, sizeof(Value)>::value, - ::rocprim::max(1u, 15u / item_scale) - > - > - >; -}; - -template -struct radix_sort_config_900 - : select_type< - select_type_case, kernel_config<256, 10> > >, - select_type_case, kernel_config<256, 10> > >, - select_type_case, kernel_config<256, 17> > >, - select_type_case, kernel_config<256, 15> > > - > { }; - -template -struct default_radix_sort_config - : select_arch< - TargetArch, - select_arch_case<803, radix_sort_config_803 >, - select_arch_case<900, radix_sort_config_900 >, - radix_sort_config_900 - > { }; + template + struct radix_sort_config_803 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + using scan = kernel_config<256, 2>; + + using type = select_type< + select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), + radix_sort_config<8, 7, scan, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), + radix_sort_config<8, 7, scan, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), + radix_sort_config<7, 6, scan, kernel_config<256, 15>>>, + select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), + radix_sort_config<7, 6, scan, kernel_config<256, 13>>>, + radix_sort_config<6, + 4, + scan, + kernel_config::value, + ::rocprim::max(1u, 15u / item_scale)>>>; + }; + + template + struct radix_sort_config_803 + : select_type, kernel_config<256, 10>>>, + select_type_case< + sizeof(Key) == 2, + radix_sort_config<8, 7, kernel_config<256, 2>, kernel_config<256, 10>>>, + select_type_case< + sizeof(Key) == 4, + radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 9>>>, + select_type_case< + sizeof(Key) == 8, + radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 7>>>> + { + }; + + template + struct radix_sort_config_900 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + using scan = kernel_config<256, 2>; + + using type = select_type< + select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), + radix_sort_config<4, 4, scan, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), + radix_sort_config<6, 5, scan, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), + radix_sort_config<7, 6, scan, kernel_config<256, 15>>>, + select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), + radix_sort_config<7, 6, scan, kernel_config<256, 15>>>, + radix_sort_config<6, + 4, + scan, + kernel_config::value, + ::rocprim::max(1u, 15u / item_scale)>>>; + }; + + template + struct radix_sort_config_900 + : select_type, kernel_config<256, 10>>>, + select_type_case< + sizeof(Key) == 2, + radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>>>, + select_type_case< + sizeof(Key) == 4, + radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 17>>>, + select_type_case< + sizeof(Key) == 8, + radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 15>>>> + { + }; + + template + struct default_radix_sort_config + : select_arch>, + select_arch_case<900, radix_sort_config_900>, + radix_sort_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_reduce.hpp b/rocprim/include/rocprim/device/device_reduce.hpp index dddec39f2..31a114693 100644 --- a/rocprim/include/rocprim/device/device_reduce.hpp +++ b/rocprim/include/rocprim/device/device_reduce.hpp @@ -21,15 +21,15 @@ #ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_ #define ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_ -#include #include +#include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/match_result_type.hpp" +#include "../detail/various.hpp" -#include "device_reduce_config.hpp" #include "detail/device_reduce.hpp" +#include "device_reduce_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -39,152 +39,159 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - bool WithInitialValue, - class Config, - class ResultType, - class InputIterator, - class OutputIterator, - class InitValueType, - class BinaryFunction -> -__global__ -void block_reduce_kernel(InputIterator input, - const size_t size, - OutputIterator output, - InitValueType initial_value, - BinaryFunction reduce_op) -{ - block_reduce_kernel_impl( - input, size, output, initial_value, reduce_op - ); -} + template + __global__ void block_reduce_kernel(InputIterator input, + const size_t size, + OutputIterator output, + InitValueType initial_value, + BinaryFunction reduce_op) + { + block_reduce_kernel_impl( + input, size, output, initial_value, reduce_op); + } -#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ +#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } + template + inline hipError_t reduce_impl(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction reduce_op, + const hipStream_t stream, + bool debug_synchronous) + { + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; -template< - bool WithInitialValue, // true when inital_value should be used in reduction - class Config, - class InputIterator, - class OutputIterator, - class InitValueType, - class BinaryFunction -> -inline -hipError_t reduce_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction reduce_op, - const hipStream_t stream, - bool debug_synchronous) -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + // Get default config if Config is default_config + using config + = default_or_custom_config>; - // Get default config if Config is default_config - using config = default_or_custom_config< - Config, - default_reduce_config - >; + constexpr unsigned int block_size = config::block_size; + constexpr unsigned int items_per_thread = config::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + if(temporary_storage == nullptr) + { + storage_size = reduce_get_temporary_storage_bytes(size, items_per_block); + // Make sure user won't try to allocate 0 bytes memory + storage_size = storage_size == 0 ? 4 : storage_size; + return hipSuccess; + } - if(temporary_storage == nullptr) - { - storage_size = reduce_get_temporary_storage_bytes(size, items_per_block); - // Make sure user won't try to allocate 0 bytes memory - storage_size = storage_size == 0 ? 4 : storage_size; - return hipSuccess; - } + // Start point for time measurements + std::chrono::high_resolution_clock::time_point start; - // Start point for time measurements - std::chrono::high_resolution_clock::time_point start; + auto number_of_blocks = (size + items_per_block - 1) / items_per_block; + if(debug_synchronous) + { + std::cout << "block_size " << block_size << '\n'; + std::cout << "number of blocks " << number_of_blocks << '\n'; + std::cout << "items_per_block " << items_per_block << '\n'; + } - auto number_of_blocks = (size + items_per_block - 1)/items_per_block; - if(debug_synchronous) - { - std::cout << "block_size " << block_size << '\n'; - std::cout << "number of blocks " << number_of_blocks << '\n'; - std::cout << "items_per_block " << items_per_block << '\n'; - } + if(number_of_blocks > 1) + { + // Pointer to array with block_prefixes + result_type* block_prefixes = static_cast(temporary_storage); - if(number_of_blocks > 1) - { - // Pointer to array with block_prefixes - result_type * block_prefixes = static_cast(temporary_storage); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(detail::block_reduce_kernel), + dim3(number_of_blocks), + dim3(block_size), + 0, + stream, + input, + size, + block_prefixes, + initial_value, + reduce_op); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, start); - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::block_reduce_kernel), - dim3(number_of_blocks), dim3(block_size), 0, stream, - input, size, block_prefixes, initial_value, reduce_op - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, start); + void* nested_temp_storage = static_cast(block_prefixes + number_of_blocks); + auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type)); - void * nested_temp_storage = static_cast(block_prefixes + number_of_blocks); - auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type)); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + auto error = reduce_impl(nested_temp_storage, + nested_temp_storage_size, + block_prefixes, // input + output, // output + initial_value, + number_of_blocks, // input size + reduce_op, + stream, + debug_synchronous); + if(error != hipSuccess) + return error; + ROCPRIM_DETAIL_HIP_SYNC("nested_device_reduce", number_of_blocks, start); + } + else + { + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(detail::block_reduce_kernel), + dim3(1), + dim3(block_size), + 0, + stream, + input, + size, + output, + initial_value, + reduce_op); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, start); + } - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - auto error = reduce_impl( - nested_temp_storage, - nested_temp_storage_size, - block_prefixes, // input - output, // output - initial_value, - number_of_blocks, // input size - reduce_op, - stream, - debug_synchronous - ); - if(error != hipSuccess) return error; - ROCPRIM_DETAIL_HIP_SYNC("nested_device_reduce", number_of_blocks, start); - } - else - { - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::block_reduce_kernel), - dim3(1), dim3(block_size), 0, stream, - input, size, output, initial_value, reduce_op - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, start); + return hipSuccess; } - return hipSuccess; -} - #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR #undef ROCPRIM_DETAIL_HIP_SYNC @@ -275,29 +282,31 @@ hipError_t reduce_impl(void * temporary_storage, /// // output: [1] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class InitValueType, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t reduce(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction reduce_op = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t reduce(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction reduce_op = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::reduce_impl( - temporary_storage, storage_size, - input, output, initial_value, size, - reduce_op, stream, debug_synchronous - ); + return detail::reduce_impl(temporary_storage, + storage_size, + input, + output, + initial_value, + size, + reduce_op, + stream, + debug_synchronous); } /// \brief Parallel reduce primitive for device level. @@ -375,29 +384,31 @@ hipError_t reduce(void * temporary_storage, /// // output: [36] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t reduce(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const size_t size, - BinaryFunction reduce_op = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t reduce(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const size_t size, + BinaryFunction reduce_op = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - return detail::reduce_impl( - temporary_storage, storage_size, - input, output, input_type(), size, - reduce_op, stream, debug_synchronous - ); + return detail::reduce_impl(temporary_storage, + storage_size, + input, + output, + input_type(), + size, + reduce_op, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/device_reduce_by_key.hpp index fa4d9d00d..57f96738c 100644 --- a/rocprim/include/rocprim/device/device_reduce_by_key.hpp +++ b/rocprim/include/rocprim/device/device_reduce_by_key.hpp @@ -21,17 +21,17 @@ #ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_ #define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_ -#include #include +#include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/match_result_type.hpp" +#include "../detail/various.hpp" #include "../functional.hpp" -#include "device_reduce_by_key_config.hpp" #include "detail/device_reduce_by_key.hpp" +#include "device_reduce_by_key_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -41,230 +41,251 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class KeysInputIterator, - class KeyCompareFunction -> -__global__ -void fill_unique_counts_kernel(KeysInputIterator keys_input, - unsigned int size, - unsigned int * unique_counts, - KeyCompareFunction key_compare_op, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - fill_unique_counts( - keys_input, size, - unique_counts, - key_compare_op, - blocks_per_full_batch, full_batches - ); -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class UniqueCountOutputIterator -> -__global__ -void scan_unique_counts_kernel(unsigned int * unique_counts, - UniqueCountOutputIterator unique_count_output, - unsigned int batches) -{ - scan_unique_counts(unique_counts, unique_count_output, batches); -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class KeysInputIterator, - class ValuesInputIterator, - class Result, - class UniqueOutputIterator, - class AggregatesOutputIterator, - class KeyCompareFunction, - class BinaryFunction -> -__global__ -void reduce_by_key_kernel(KeysInputIterator keys_input, - ValuesInputIterator values_input, - unsigned int size, - const unsigned int * unique_starts, - carry_out * carry_outs, - Result * leading_aggregates, - UniqueOutputIterator unique_output, - AggregatesOutputIterator aggregates_output, - KeyCompareFunction key_compare_op, - BinaryFunction reduce_op, - unsigned int blocks_per_full_batch, - unsigned int full_batches) -{ - reduce_by_key( - keys_input, values_input, size, - unique_starts, carry_outs, leading_aggregates, - unique_output, aggregates_output, - key_compare_op, reduce_op, - blocks_per_full_batch, full_batches - ); -} - -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class Result, - class AggregatesOutputIterator, - class BinaryFunction -> -__global__ -void scan_and_scatter_carry_outs_kernel(const carry_out * carry_outs, - const Result * leading_aggregates, - AggregatesOutputIterator aggregates_output, - BinaryFunction reduce_op, - unsigned int batches) -{ - scan_and_scatter_carry_outs( - carry_outs, leading_aggregates, aggregates_output, - reduce_op, - batches - ); -} + template + __global__ void fill_unique_counts_kernel(KeysInputIterator keys_input, + unsigned int size, + unsigned int* unique_counts, + KeyCompareFunction key_compare_op, + unsigned int blocks_per_full_batch, + unsigned int full_batches) + { + fill_unique_counts( + keys_input, size, unique_counts, key_compare_op, blocks_per_full_batch, full_batches); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ + template + __global__ void scan_unique_counts_kernel(unsigned int* unique_counts, + UniqueCountOutputIterator unique_count_output, + unsigned int batches) + { + scan_unique_counts(unique_counts, unique_count_output, batches); } -template< - class Config, - class KeysInputIterator, - class ValuesInputIterator, - class UniqueOutputIterator, - class AggregatesOutputIterator, - class UniqueCountOutputIterator, - class BinaryFunction, - class KeyCompareFunction -> -inline -hipError_t reduce_by_key_impl(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - ValuesInputIterator values_input, - const unsigned int size, - UniqueOutputIterator unique_output, - AggregatesOutputIterator aggregates_output, - UniqueCountOutputIterator unique_count_output, - BinaryFunction reduce_op, - KeyCompareFunction key_compare_op, - const hipStream_t stream, - const bool debug_synchronous) -{ - using key_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - typename std::iterator_traits::value_type, - BinaryFunction - >::type; - using carry_out_type = carry_out; - - using config = default_or_custom_config< - Config, - default_reduce_by_key_config - >; - - constexpr unsigned int items_per_block = config::reduce::block_size * config::reduce::items_per_thread; - constexpr unsigned int scan_items_per_block = config::scan::block_size * config::scan::items_per_thread; - - const unsigned int blocks = std::max(1u, ::rocprim::detail::ceiling_div(size, items_per_block)); - const unsigned int blocks_per_full_batch = ::rocprim::detail::ceiling_div(blocks, scan_items_per_block); - const unsigned int full_batches = blocks % scan_items_per_block != 0 - ? blocks % scan_items_per_block - : scan_items_per_block; - const unsigned int batches = (blocks_per_full_batch == 1 ? full_batches : scan_items_per_block); - - const size_t unique_counts_bytes = ::rocprim::detail::align_size(batches * sizeof(unsigned int)); - const size_t carry_outs_bytes = ::rocprim::detail::align_size(batches * sizeof(carry_out_type)); - const size_t leading_aggregates_bytes = ::rocprim::detail::align_size(batches * sizeof(result_type)); - if(temporary_storage == nullptr) + template + __global__ void reduce_by_key_kernel(KeysInputIterator keys_input, + ValuesInputIterator values_input, + unsigned int size, + const unsigned int* unique_starts, + carry_out* carry_outs, + Result* leading_aggregates, + UniqueOutputIterator unique_output, + AggregatesOutputIterator aggregates_output, + KeyCompareFunction key_compare_op, + BinaryFunction reduce_op, + unsigned int blocks_per_full_batch, + unsigned int full_batches) { - storage_size = unique_counts_bytes + carry_outs_bytes + leading_aggregates_bytes; - return hipSuccess; + reduce_by_key(keys_input, + values_input, + size, + unique_starts, + carry_outs, + leading_aggregates, + unique_output, + aggregates_output, + key_compare_op, + reduce_op, + blocks_per_full_batch, + full_batches); } - if(debug_synchronous) + template + __global__ void scan_and_scatter_carry_outs_kernel(const carry_out* carry_outs, + const Result* leading_aggregates, + AggregatesOutputIterator aggregates_output, + BinaryFunction reduce_op, + unsigned int batches) { - std::cout << "blocks " << blocks << '\n'; - std::cout << "blocks_per_full_batch " << blocks_per_full_batch << '\n'; - std::cout << "full_batches " << full_batches << '\n'; - std::cout << "batches " << batches << '\n'; - std::cout << "storage_size " << storage_size << '\n'; - hipError_t error = hipStreamSynchronize(stream); - if(error != hipSuccess) return error; + scan_and_scatter_carry_outs( + carry_outs, leading_aggregates, aggregates_output, reduce_op, batches); } - char * ptr = reinterpret_cast(temporary_storage); - unsigned int * unique_counts = reinterpret_cast(ptr); - ptr += unique_counts_bytes; - carry_out_type * carry_outs = reinterpret_cast(ptr); - ptr += carry_outs_bytes; - result_type * leading_aggregates = reinterpret_cast(ptr); - - // Start point for time measurements - std::chrono::high_resolution_clock::time_point start; - - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(fill_unique_counts_kernel), - dim3(batches), dim3(config::reduce::block_size), 0, stream, - keys_input, size, unique_counts, key_compare_op, - blocks_per_full_batch, full_batches - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_unique_counts", size, start) - - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(scan_unique_counts_kernel), - dim3(1), dim3(config::scan::block_size), 0, stream, - unique_counts, unique_count_output, - batches - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_unique_counts", config::scan::block_size, start) - - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_by_key_kernel), - dim3(batches), dim3(config::reduce::block_size), 0, stream, - keys_input, values_input, size, - const_cast(unique_counts), carry_outs, leading_aggregates, - unique_output, aggregates_output, - key_compare_op, reduce_op, - blocks_per_full_batch, full_batches - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("reduce_by_key", size, start) - - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(scan_and_scatter_carry_outs_kernel), - dim3(1), dim3(config::scan::block_size), 0, stream, - const_cast(carry_outs), const_cast(leading_aggregates), - aggregates_output, - reduce_op, - batches - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_and_scatter_carry_outs", config::scan::block_size, start) - - return hipSuccess; -} +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ + } + + template + inline hipError_t reduce_by_key_impl(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + ValuesInputIterator values_input, + const unsigned int size, + UniqueOutputIterator unique_output, + AggregatesOutputIterator aggregates_output, + UniqueCountOutputIterator unique_count_output, + BinaryFunction reduce_op, + KeyCompareFunction key_compare_op, + const hipStream_t stream, + const bool debug_synchronous) + { + using key_type = typename std::iterator_traits::value_type; + using result_type = typename ::rocprim::detail::match_result_type< + typename std::iterator_traits::value_type, + BinaryFunction>::type; + using carry_out_type = carry_out; + + using config = default_or_custom_config< + Config, + default_reduce_by_key_config>; + + constexpr unsigned int items_per_block + = config::reduce::block_size * config::reduce::items_per_thread; + constexpr unsigned int scan_items_per_block + = config::scan::block_size * config::scan::items_per_thread; + + const unsigned int blocks + = std::max(1u, ::rocprim::detail::ceiling_div(size, items_per_block)); + const unsigned int blocks_per_full_batch + = ::rocprim::detail::ceiling_div(blocks, scan_items_per_block); + const unsigned int full_batches = blocks % scan_items_per_block != 0 + ? blocks % scan_items_per_block + : scan_items_per_block; + const unsigned int batches + = (blocks_per_full_batch == 1 ? full_batches : scan_items_per_block); + + const size_t unique_counts_bytes + = ::rocprim::detail::align_size(batches * sizeof(unsigned int)); + const size_t carry_outs_bytes + = ::rocprim::detail::align_size(batches * sizeof(carry_out_type)); + const size_t leading_aggregates_bytes + = ::rocprim::detail::align_size(batches * sizeof(result_type)); + if(temporary_storage == nullptr) + { + storage_size = unique_counts_bytes + carry_outs_bytes + leading_aggregates_bytes; + return hipSuccess; + } + + if(debug_synchronous) + { + std::cout << "blocks " << blocks << '\n'; + std::cout << "blocks_per_full_batch " << blocks_per_full_batch << '\n'; + std::cout << "full_batches " << full_batches << '\n'; + std::cout << "batches " << batches << '\n'; + std::cout << "storage_size " << storage_size << '\n'; + hipError_t error = hipStreamSynchronize(stream); + if(error != hipSuccess) + return error; + } + + char* ptr = reinterpret_cast(temporary_storage); + unsigned int* unique_counts = reinterpret_cast(ptr); + ptr += unique_counts_bytes; + carry_out_type* carry_outs = reinterpret_cast(ptr); + ptr += carry_outs_bytes; + result_type* leading_aggregates = reinterpret_cast(ptr); + + // Start point for time measurements + std::chrono::high_resolution_clock::time_point start; + + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(fill_unique_counts_kernel), + dim3(batches), + dim3(config::reduce::block_size), + 0, + stream, + keys_input, + size, + unique_counts, + key_compare_op, + blocks_per_full_batch, + full_batches); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_unique_counts", size, start) + + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(scan_unique_counts_kernel), + dim3(1), + dim3(config::scan::block_size), + 0, + stream, + unique_counts, + unique_count_output, + batches); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR( + "scan_unique_counts", config::scan::block_size, start) + + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + reduce_by_key_kernel), + dim3(batches), + dim3(config::reduce::block_size), + 0, + stream, + keys_input, + values_input, + size, + const_cast(unique_counts), + carry_outs, + leading_aggregates, + unique_output, + aggregates_output, + key_compare_op, + reduce_op, + blocks_per_full_batch, + full_batches); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("reduce_by_key", size, start) + + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(scan_and_scatter_carry_outs_kernel), + dim3(1), + dim3(config::scan::block_size), + 0, + stream, + const_cast(carry_outs), + const_cast(leading_aggregates), + aggregates_output, + reduce_op, + batches); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR( + "scan_and_scatter_carry_outs", config::scan::block_size, start) + + return hipSuccess; + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR @@ -371,37 +392,41 @@ hipError_t reduce_by_key_impl(void * temporary_storage, /// // unique_count_output: [4] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class ValuesInputIterator, - class UniqueOutputIterator, - class AggregatesOutputIterator, - class UniqueCountOutputIterator, - class BinaryFunction = ::rocprim::plus::value_type>, - class KeyCompareFunction = ::rocprim::equal_to::value_type> -> -inline -hipError_t reduce_by_key(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - ValuesInputIterator values_input, - unsigned int size, - UniqueOutputIterator unique_output, - AggregatesOutputIterator aggregates_output, - UniqueCountOutputIterator unique_count_output, - BinaryFunction reduce_op = BinaryFunction(), - KeyCompareFunction key_compare_op = KeyCompareFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>, + class KeyCompareFunction + = ::rocprim::equal_to::value_type>> +inline hipError_t reduce_by_key(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + ValuesInputIterator values_input, + unsigned int size, + UniqueOutputIterator unique_output, + AggregatesOutputIterator aggregates_output, + UniqueCountOutputIterator unique_count_output, + BinaryFunction reduce_op = BinaryFunction(), + KeyCompareFunction key_compare_op = KeyCompareFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::reduce_by_key_impl( - temporary_storage, storage_size, - keys_input, values_input, size, - unique_output, aggregates_output, unique_count_output, - reduce_op, key_compare_op, - stream, debug_synchronous - ); + return detail::reduce_by_key_impl(temporary_storage, + storage_size, + keys_input, + values_input, + size, + unique_output, + aggregates_output, + unique_count_output, + reduce_op, + key_compare_op, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp b/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp index 8de5e62ba..86a780c69 100644 --- a/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp +++ b/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp @@ -37,10 +37,7 @@ BEGIN_ROCPRIM_NAMESPACE /// /// \tparam ScanConfig - configuration of carry-outs scan kernel. Must be \p kernel_config. /// \tparam ReduceConfig - configuration of the main reduce-by-key kernel. Must be \p kernel_config. -template< - class ScanConfig, - class ReduceConfig -> +template struct reduce_by_key_config { /// \brief Configuration of carry-outs scan kernel. @@ -52,48 +49,42 @@ struct reduce_by_key_config namespace detail { -template -struct reduce_by_key_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Key) + sizeof(Value), 2 * sizeof(int)); - - using scan = kernel_config<256, 4>; - - using type = select_type< - select_type_case< - (sizeof(Key) <= 8 && sizeof(Value) <= 8), - reduce_by_key_config > - >, - reduce_by_key_config > - >; -}; - -template -struct reduce_by_key_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Key) + sizeof(Value), 2 * sizeof(int)); - - using scan = kernel_config<256, 2>; - - using type = select_type< - select_type_case< - (sizeof(Key) <= 8 && sizeof(Value) <= 8), - reduce_by_key_config > - >, - reduce_by_key_config > - >; -}; - -template -struct default_reduce_by_key_config - : select_arch< - TargetArch, - select_arch_case<803, reduce_by_key_config_803 >, - select_arch_case<900, reduce_by_key_config_900 >, - reduce_by_key_config_900 - > { }; + template + struct reduce_by_key_config_803 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + sizeof(Key) + sizeof(Value), 2 * sizeof(int)); + + using scan = kernel_config<256, 4>; + + using type = select_type< + select_type_case<(sizeof(Key) <= 8 && sizeof(Value) <= 8), + reduce_by_key_config>>, + reduce_by_key_config>>; + }; + + template + struct reduce_by_key_config_900 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + sizeof(Key) + sizeof(Value), 2 * sizeof(int)); + + using scan = kernel_config<256, 2>; + + using type = select_type< + select_type_case<(sizeof(Key) <= 8 && sizeof(Value) <= 8), + reduce_by_key_config>>, + reduce_by_key_config>>; + }; + + template + struct default_reduce_by_key_config + : select_arch>, + select_arch_case<900, reduce_by_key_config_900>, + reduce_by_key_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_reduce_config.hpp b/rocprim/include/rocprim/device/device_reduce_config.hpp index 4e39de71b..b37cb19a3 100644 --- a/rocprim/include/rocprim/device/device_reduce_config.hpp +++ b/rocprim/include/rocprim/device/device_reduce_config.hpp @@ -40,11 +40,9 @@ BEGIN_ROCPRIM_NAMESPACE /// \tparam BlockSize - number of threads in a block. /// \tparam ItemsPerThread - number of items processed by each thread. /// \tparam BlockReduceMethod - algorithm for block reduce. -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - ::rocprim::block_reduce_algorithm BlockReduceMethod -> +template struct reduce_config { /// \brief Number of threads in a block. @@ -58,40 +56,35 @@ struct reduce_config namespace detail { -template -struct reduce_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = reduce_config< - 256, - ::rocprim::max(1u, 16u / item_scale), - ::rocprim::block_reduce_algorithm::using_warp_reduce - >; -}; - -template -struct reduce_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = reduce_config< - 256, - ::rocprim::max(1u, 16u / item_scale), - ::rocprim::block_reduce_algorithm::using_warp_reduce - >; -}; - -template -struct default_reduce_config - : select_arch< - TargetArch, - select_arch_case<803, reduce_config_803>, - select_arch_case<900, reduce_config_900>, - reduce_config_900 - > { }; + template + struct reduce_config_803 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = reduce_config<256, + ::rocprim::max(1u, 16u / item_scale), + ::rocprim::block_reduce_algorithm::using_warp_reduce>; + }; + + template + struct reduce_config_900 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = reduce_config<256, + ::rocprim::max(1u, 16u / item_scale), + ::rocprim::block_reduce_algorithm::using_warp_reduce>; + }; + + template + struct default_reduce_config : select_arch>, + select_arch_case<900, reduce_config_900>, + reduce_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_run_length_encode.hpp b/rocprim/include/rocprim/device/device_run_length_encode.hpp index 2ffb297f2..2c4a0b778 100644 --- a/rocprim/include/rocprim/device/device_run_length_encode.hpp +++ b/rocprim/include/rocprim/device/device_run_length_encode.hpp @@ -21,8 +21,8 @@ #ifndef ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_ #define ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_ -#include #include +#include #include "../config.hpp" #include "../detail/various.hpp" @@ -32,8 +32,8 @@ #include "../iterator/discard_iterator.hpp" #include "../iterator/zip_iterator.hpp" -#include "device_run_length_encode_config.hpp" #include "device_reduce_by_key.hpp" +#include "device_run_length_encode_config.hpp" #include "device_select.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -44,18 +44,20 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } } // end detail namespace @@ -140,39 +142,40 @@ namespace detail /// // runs_count_output: [4] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class UniqueOutputIterator, - class CountsOutputIterator, - class RunsCountOutputIterator -> -inline -hipError_t run_length_encode(void * temporary_storage, - size_t& storage_size, - InputIterator input, - unsigned int size, - UniqueOutputIterator unique_output, - CountsOutputIterator counts_output, - RunsCountOutputIterator runs_count_output, - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t run_length_encode(void* temporary_storage, + size_t& storage_size, + InputIterator input, + unsigned int size, + UniqueOutputIterator unique_output, + CountsOutputIterator counts_output, + RunsCountOutputIterator runs_count_output, + hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; using count_type = unsigned int; - using config = detail::default_or_custom_config< - Config, - detail::default_run_length_encode_config - >; + using config + = detail::default_or_custom_config; return ::rocprim::reduce_by_key( - temporary_storage, storage_size, - input, make_constant_iterator(1), size, - unique_output, counts_output, runs_count_output, - ::rocprim::plus(), ::rocprim::equal_to(), - stream, debug_synchronous - ); + temporary_storage, + storage_size, + input, + make_constant_iterator(1), + size, + unique_output, + counts_output, + runs_count_output, + ::rocprim::plus(), + ::rocprim::equal_to(), + stream, + debug_synchronous); } /// \brief Parallel run-length encoding of non-trivial runs for device level. @@ -256,144 +259,143 @@ hipError_t run_length_encode(void * temporary_storage, /// // runs_count_output: [2] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OffsetsOutputIterator, - class CountsOutputIterator, - class RunsCountOutputIterator -> -inline -hipError_t run_length_encode_non_trivial_runs(void * temporary_storage, - size_t& storage_size, - InputIterator input, - unsigned int size, - OffsetsOutputIterator offsets_output, - CountsOutputIterator counts_output, - RunsCountOutputIterator runs_count_output, - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t run_length_encode_non_trivial_runs(void* temporary_storage, + size_t& storage_size, + InputIterator input, + unsigned int size, + OffsetsOutputIterator offsets_output, + CountsOutputIterator counts_output, + RunsCountOutputIterator runs_count_output, + hipStream_t stream = 0, + bool debug_synchronous = false) { - using input_type = typename std::iterator_traits::value_type; - using offset_type = unsigned int; - using count_type = unsigned int; + using input_type = typename std::iterator_traits::value_type; + using offset_type = unsigned int; + using count_type = unsigned int; using offset_count_pair = typename ::rocprim::tuple; - using config = detail::default_or_custom_config< - Config, - detail::default_run_length_encode_config - >; + using config + = detail::default_or_custom_config; hipError_t error; - auto reduce_op = [] __device__ (const offset_count_pair& a, const offset_count_pair& b) - { + auto reduce_op = [] __device__(const offset_count_pair& a, const offset_count_pair& b) { return offset_count_pair( ::rocprim::get<0>(a), // Always use offset of the first item of the run ::rocprim::get<1>(a) + ::rocprim::get<1>(b) // Number of items in the run ); }; - auto non_trivial_runs_select_op = [] __device__ (const offset_count_pair& a) - { - return ::rocprim::get<1>(a) > 1; - }; + auto non_trivial_runs_select_op + = [] __device__(const offset_count_pair& a) { return ::rocprim::get<1>(a) > 1; }; - offset_type * offsets_tmp = nullptr; - count_type * counts_tmp = nullptr; - count_type * all_runs_count_tmp = nullptr; + offset_type* offsets_tmp = nullptr; + count_type* counts_tmp = nullptr; + count_type* all_runs_count_tmp = nullptr; // Calculate size of temporary storage for reduce_by_key operation size_t reduce_by_key_bytes; error = ::rocprim::reduce_by_key( - nullptr, reduce_by_key_bytes, + nullptr, + reduce_by_key_bytes, input, ::rocprim::make_zip_iterator( - ::rocprim::make_tuple( - ::rocprim::make_counting_iterator(0), - ::rocprim::make_constant_iterator(1) - ) - ), + ::rocprim::make_tuple(::rocprim::make_counting_iterator(0), + ::rocprim::make_constant_iterator(1))), size, ::rocprim::make_discard_iterator(), ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)), all_runs_count_tmp, - reduce_op, ::rocprim::equal_to(), - stream, debug_synchronous - ); - if(error != hipSuccess) return error; + reduce_op, + ::rocprim::equal_to(), + stream, + debug_synchronous); + if(error != hipSuccess) + return error; reduce_by_key_bytes = ::rocprim::detail::align_size(reduce_by_key_bytes); // Calculate size of temporary storage for select operation size_t select_bytes; error = ::rocprim::select( - nullptr, select_bytes, + nullptr, + select_bytes, ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)), ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)), runs_count_output, size, non_trivial_runs_select_op, - stream, debug_synchronous - ); - if(error != hipSuccess) return error; + stream, + debug_synchronous); + if(error != hipSuccess) + return error; select_bytes = ::rocprim::detail::align_size(select_bytes); const size_t offsets_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(offset_type)); - const size_t counts_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(count_type)); + const size_t counts_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(count_type)); const size_t all_runs_count_tmp_bytes = sizeof(count_type); if(temporary_storage == nullptr) { - storage_size = ::rocprim::max(reduce_by_key_bytes, select_bytes) + - offsets_tmp_bytes + counts_tmp_bytes + all_runs_count_tmp_bytes; + storage_size = ::rocprim::max(reduce_by_key_bytes, select_bytes) + offsets_tmp_bytes + + counts_tmp_bytes + all_runs_count_tmp_bytes; return hipSuccess; } - char * ptr = reinterpret_cast(temporary_storage); + char* ptr = reinterpret_cast(temporary_storage); ptr += ::rocprim::max(reduce_by_key_bytes, select_bytes); - offsets_tmp = reinterpret_cast(ptr); + offsets_tmp = reinterpret_cast(ptr); ptr += offsets_tmp_bytes; - counts_tmp = reinterpret_cast(ptr); + counts_tmp = reinterpret_cast(ptr); ptr += counts_tmp_bytes; - all_runs_count_tmp = reinterpret_cast(ptr); + all_runs_count_tmp = reinterpret_cast(ptr); std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); error = ::rocprim::reduce_by_key( - temporary_storage, reduce_by_key_bytes, + temporary_storage, + reduce_by_key_bytes, input, ::rocprim::make_zip_iterator( - ::rocprim::make_tuple( - ::rocprim::make_counting_iterator(0), - ::rocprim::make_constant_iterator(1) - ) - ), + ::rocprim::make_tuple(::rocprim::make_counting_iterator(0), + ::rocprim::make_constant_iterator(1))), size, ::rocprim::make_discard_iterator(), // Ignore unique output ::rocprim::make_zip_iterator(rocprim::make_tuple(offsets_tmp, counts_tmp)), all_runs_count_tmp, - reduce_op, ::rocprim::equal_to(), - stream, debug_synchronous - ); + reduce_op, + ::rocprim::equal_to(), + stream, + debug_synchronous); ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::reduce_by_key", size, start) // Read count of all runs (including trivial runs) count_type all_runs_count; - error = hipMemcpyAsync(&all_runs_count, all_runs_count_tmp, sizeof(count_type), hipMemcpyDeviceToHost, stream); - if(error != hipSuccess) return error; + error = hipMemcpyAsync( + &all_runs_count, all_runs_count_tmp, sizeof(count_type), hipMemcpyDeviceToHost, stream); + if(error != hipSuccess) + return error; error = hipStreamSynchronize(stream); - if(error != hipSuccess) return error; + if(error != hipSuccess) + return error; // Select non-trivial runs - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); error = ::rocprim::select( - temporary_storage, select_bytes, + temporary_storage, + select_bytes, ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)), ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)), runs_count_output, all_runs_count, non_trivial_runs_select_op, - stream, debug_synchronous - ); + stream, + debug_synchronous); ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::select", all_runs_count, start) return hipSuccess; diff --git a/rocprim/include/rocprim/device/device_run_length_encode_config.hpp b/rocprim/include/rocprim/device/device_run_length_encode_config.hpp index f66cc8cfc..4ca940f01 100644 --- a/rocprim/include/rocprim/device/device_run_length_encode_config.hpp +++ b/rocprim/include/rocprim/device/device_run_length_encode_config.hpp @@ -39,10 +39,7 @@ BEGIN_ROCPRIM_NAMESPACE /// Must be \p reduce_by_key_config or \p default_config. /// \tparam SelectConfig - configuration of device-level select operation. /// Must be \p select_config or \p default_config. -template< - class ReduceByKeyConfig, - class SelectConfig = default_config -> +template struct run_length_encode_config { /// \brief Configuration of device-level reduce-by-key operation. @@ -54,7 +51,8 @@ struct run_length_encode_config namespace detail { -using default_run_length_encode_config = run_length_encode_config; + using default_run_length_encode_config + = run_length_encode_config; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_scan.hpp b/rocprim/include/rocprim/device/device_scan.hpp index c7a84bf1d..2fcae710d 100644 --- a/rocprim/include/rocprim/device/device_scan.hpp +++ b/rocprim/include/rocprim/device/device_scan.hpp @@ -21,18 +21,18 @@ #ifndef ROCPRIM_DEVICE_DEVICE_SCAN_HPP_ #define ROCPRIM_DEVICE_DEVICE_SCAN_HPP_ -#include #include +#include #include "../config.hpp" +#include "../detail/match_result_type.hpp" +#include "../detail/various.hpp" #include "../functional.hpp" #include "../type_traits.hpp" -#include "../detail/various.hpp" -#include "../detail/match_result_type.hpp" -#include "device_scan_config.hpp" -#include "detail/device_scan_reduce_then_scan.hpp" #include "detail/device_scan_lookback.hpp" +#include "detail/device_scan_reduce_then_scan.hpp" +#include "device_scan_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -42,368 +42,381 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Single kernel scan (performs scan on one thread block only) -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class BinaryFunction, - class ResultType -> -__global__ -void single_scan_kernel(InputIterator input, - const size_t size, - ResultType initial_value, - OutputIterator output, - BinaryFunction scan_op) -{ - single_scan_kernel_impl( - input, size, initial_value, output, scan_op - ); -} - -// Reduce-then-scan kernels - -// Calculates block prefixes that will be used in final_scan_kernel -// when performing block scan operations. -template< - class Config, - class InputIterator, - class BinaryFunction, - class ResultType -> -__global__ -void block_reduce_kernel(InputIterator input, - BinaryFunction scan_op, - ResultType * block_prefixes) -{ - block_reduce_kernel_impl( - input, scan_op, block_prefixes - ); -} - -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class BinaryFunction, - class ResultType -> -__global__ -void final_scan_kernel(InputIterator input, - const size_t size, - OutputIterator output, - const ResultType initial_value, - BinaryFunction scan_op, - ResultType * block_prefixes) -{ - final_scan_kernel_impl( - input, size, output, initial_value, scan_op, block_prefixes - ); -} - -// Single pass (look-back kernels) - -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class BinaryFunction, - class ResultType, - class LookBackScanState -> -__global__ -void lookback_scan_kernel(InputIterator input, - OutputIterator output, - const size_t size, - ResultType initial_value, - BinaryFunction scan_op, - LookBackScanState lookback_scan_state, - const unsigned int number_of_blocks, - ordered_block_id ordered_bid) -{ - lookback_scan_kernel_impl( - input, output, size, initial_value, scan_op, - lookback_scan_state, number_of_blocks, ordered_bid - ); -} - -template -__global__ -void init_lookback_scan_state_kernel(LookBackScanState lookback_scan_state, - const unsigned int number_of_blocks, - ordered_block_id ordered_bid) -{ - init_lookback_scan_state_kernel_impl( - lookback_scan_state, number_of_blocks, ordered_bid - ); -} - -#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } - -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ + // Single kernel scan (performs scan on one thread block only) + template + __global__ void single_scan_kernel(InputIterator input, + const size_t size, + ResultType initial_value, + OutputIterator output, + BinaryFunction scan_op) + { + single_scan_kernel_impl(input, size, initial_value, output, scan_op); } -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class InitValueType, - class BinaryFunction -> -inline -auto scan_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction scan_op, - const hipStream_t stream, - bool debug_synchronous) - -> typename std::enable_if::type -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; - - using config = Config; - - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + // Reduce-then-scan kernels - // Calculate required temporary storage - if(temporary_storage == nullptr) + // Calculates block prefixes that will be used in final_scan_kernel + // when performing block scan operations. + template + __global__ void + block_reduce_kernel(InputIterator input, BinaryFunction scan_op, ResultType* block_prefixes) { - storage_size = scan_get_temporary_storage_bytes(size, items_per_block); - // Make sure user won't try to allocate 0 bytes memory, because - // hipMalloc will return nullptr when size is zero. - storage_size = storage_size == 0 ? 4 : storage_size; - return hipSuccess; + block_reduce_kernel_impl(input, scan_op, block_prefixes); } - // Start point for time measurements - std::chrono::high_resolution_clock::time_point start; - - auto number_of_blocks = (size + items_per_block - 1)/items_per_block; - if(debug_synchronous) + template + __global__ void final_scan_kernel(InputIterator input, + const size_t size, + OutputIterator output, + const ResultType initial_value, + BinaryFunction scan_op, + ResultType* block_prefixes) { - std::cout << "block_size " << block_size << '\n'; - std::cout << "number of blocks " << number_of_blocks << '\n'; - std::cout << "items_per_block " << items_per_block << '\n'; + final_scan_kernel_impl( + input, size, output, initial_value, scan_op, block_prefixes); } - if(number_of_blocks > 1) + // Single pass (look-back kernels) + + template + __global__ void lookback_scan_kernel(InputIterator input, + OutputIterator output, + const size_t size, + ResultType initial_value, + BinaryFunction scan_op, + LookBackScanState lookback_scan_state, + const unsigned int number_of_blocks, + ordered_block_id ordered_bid) { - // Pointer to array with block_prefixes - result_type * block_prefixes = static_cast(temporary_storage); - - // Grid size for block_reduce_kernel, we don't need to calculate reduction - // of the last block as it will never be used as prefix for other blocks - auto grid_size = number_of_blocks - 1; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::block_reduce_kernel< - config, InputIterator, BinaryFunction, result_type - >), - dim3(grid_size), dim3(block_size), 0, stream, - input, scan_op, block_prefixes - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, start) - - // TODO: Performance may increase if for (number_of_blocks < 8192) (or some other - // threshold) we would just use CPU to calculate prefixes. - - // Calculate size of temporary storage for nested device scan operation - void * nested_temp_storage = static_cast(block_prefixes + number_of_blocks); - auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type)); - - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - auto error = scan_impl( - nested_temp_storage, - nested_temp_storage_size, - block_prefixes, // input - block_prefixes, // output - result_type(), // dummy initial value - number_of_blocks, // input size - scan_op, - stream, - debug_synchronous - ); - if(error != hipSuccess) return error; - ROCPRIM_DETAIL_HIP_SYNC("nested_device_scan", number_of_blocks, start); - - // Grid size for final_scan_kernel - grid_size = number_of_blocks; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::final_scan_kernel< - Exclusive, // flag for exclusive scan operation - config, // kernel configuration (block size, ipt) - InputIterator, OutputIterator, - BinaryFunction, result_type - >), - dim3(grid_size), dim3(block_size), 0, stream, - input, - size, - output, - static_cast(initial_value), - scan_op, - block_prefixes - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("final_scan_kernel", size, start); + lookback_scan_kernel_impl(input, + output, + size, + initial_value, + scan_op, + lookback_scan_state, + number_of_blocks, + ordered_bid); } - else + + template + __global__ void init_lookback_scan_state_kernel(LookBackScanState lookback_scan_state, + const unsigned int number_of_blocks, + ordered_block_id ordered_bid) { - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::single_scan_kernel< - Exclusive, // flag for exclusive scan operation - config, // kernel configuration (block size, ipt) - InputIterator, OutputIterator, BinaryFunction - >), - dim3(1), dim3(block_size), 0, stream, - input, size, static_cast(initial_value), output, scan_op - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start); + init_lookback_scan_state_kernel_impl(lookback_scan_state, number_of_blocks, ordered_bid); } - return hipSuccess; -} -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class InitValueType, - class BinaryFunction -> -inline -auto scan_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction scan_op, - const hipStream_t stream, - bool debug_synchronous) - -> typename std::enable_if::type -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; - - using config = Config; - - using scan_state_type = detail::lookback_scan_state; - using ordered_block_id_type = detail::ordered_block_id; - - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; - const unsigned int number_of_blocks = (size + items_per_block - 1)/items_per_block; - - // Calculate required temporary storage - size_t scan_state_bytes = ::rocprim::detail::align_size( - scan_state_type::get_storage_size(number_of_blocks) - ); - size_t ordered_block_id_bytes = ordered_block_id_type::get_storage_size(); - if(temporary_storage == nullptr) - { - // storage_size is never zero - storage_size = scan_state_bytes + ordered_block_id_bytes; - return hipSuccess; +#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ } - // Start point for time measurements - std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) - { - std::cout << "size " << size << '\n'; - std::cout << "block_size " << block_size << '\n'; - std::cout << "number of blocks " << number_of_blocks << '\n'; - std::cout << "items_per_block " << items_per_block << '\n'; +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } - if(number_of_blocks > 1) + template + inline auto scan_impl(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction scan_op, + const hipStream_t stream, + bool debug_synchronous) -> + typename std::enable_if::type { - // Create and initialize lookback_scan_state obj - auto scan_state = scan_state_type::create(temporary_storage, number_of_blocks); - // Create ad initialize ordered_block_id obj - auto ptr = reinterpret_cast(temporary_storage); - auto ordered_bid = ordered_block_id_type::create( - reinterpret_cast(ptr + scan_state_bytes) - ); - - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - auto grid_size = (number_of_blocks + block_size - 1)/block_size; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(init_lookback_scan_state_kernel), - dim3(grid_size), dim3(block_size), 0, stream, - scan_state, number_of_blocks, ordered_bid - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_lookback_scan_state_kernel", size, start) - - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - grid_size = number_of_blocks; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(lookback_scan_kernel< - Exclusive, // flag for exclusive scan operation - config, // kernel configuration (block size, ipt) - InputIterator, OutputIterator, - BinaryFunction, result_type, scan_state_type - >), - dim3(grid_size), dim3(block_size), 0, stream, - input, output, size, static_cast(initial_value), - scan_op, scan_state, number_of_blocks, ordered_bid - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("lookback_scan_kernel", size, start) + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; + + using config = Config; + + constexpr unsigned int block_size = config::block_size; + constexpr unsigned int items_per_thread = config::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; + + // Calculate required temporary storage + if(temporary_storage == nullptr) + { + storage_size = scan_get_temporary_storage_bytes(size, items_per_block); + // Make sure user won't try to allocate 0 bytes memory, because + // hipMalloc will return nullptr when size is zero. + storage_size = storage_size == 0 ? 4 : storage_size; + return hipSuccess; + } + + // Start point for time measurements + std::chrono::high_resolution_clock::time_point start; + + auto number_of_blocks = (size + items_per_block - 1) / items_per_block; + if(debug_synchronous) + { + std::cout << "block_size " << block_size << '\n'; + std::cout << "number of blocks " << number_of_blocks << '\n'; + std::cout << "items_per_block " << items_per_block << '\n'; + } + + if(number_of_blocks > 1) + { + // Pointer to array with block_prefixes + result_type* block_prefixes = static_cast(temporary_storage); + + // Grid size for block_reduce_kernel, we don't need to calculate reduction + // of the last block as it will never be used as prefix for other blocks + auto grid_size = number_of_blocks - 1; + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + detail:: + block_reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + input, + scan_op, + block_prefixes); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, start) + + // TODO: Performance may increase if for (number_of_blocks < 8192) (or some other + // threshold) we would just use CPU to calculate prefixes. + + // Calculate size of temporary storage for nested device scan operation + void* nested_temp_storage = static_cast(block_prefixes + number_of_blocks); + auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type)); + + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + auto error = scan_impl(nested_temp_storage, + nested_temp_storage_size, + block_prefixes, // input + block_prefixes, // output + result_type(), // dummy initial value + number_of_blocks, // input size + scan_op, + stream, + debug_synchronous); + if(error != hipSuccess) + return error; + ROCPRIM_DETAIL_HIP_SYNC("nested_device_scan", number_of_blocks, start); + + // Grid size for final_scan_kernel + grid_size = number_of_blocks; + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + detail::final_scan_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + input, + size, + output, + static_cast(initial_value), + scan_op, + block_prefixes); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("final_scan_kernel", size, start); + } + else + { + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + detail::single_scan_kernel), + dim3(1), + dim3(block_size), + 0, + stream, + input, + size, + static_cast(initial_value), + output, + scan_op); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start); + } + return hipSuccess; } - else + + template + inline auto scan_impl(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction scan_op, + const hipStream_t stream, + bool debug_synchronous) -> + typename std::enable_if::type { - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(single_scan_kernel< - Exclusive, // flag for exclusive scan operation - config, // kernel configuration (block size, ipt) - InputIterator, OutputIterator, BinaryFunction - >), - dim3(1), dim3(block_size), 0, stream, - input, size, static_cast(initial_value), output, scan_op - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start); + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; + + using config = Config; + + using scan_state_type = detail::lookback_scan_state; + using ordered_block_id_type = detail::ordered_block_id; + + constexpr unsigned int block_size = config::block_size; + constexpr unsigned int items_per_thread = config::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; + const unsigned int number_of_blocks = (size + items_per_block - 1) / items_per_block; + + // Calculate required temporary storage + size_t scan_state_bytes + = ::rocprim::detail::align_size(scan_state_type::get_storage_size(number_of_blocks)); + size_t ordered_block_id_bytes = ordered_block_id_type::get_storage_size(); + if(temporary_storage == nullptr) + { + // storage_size is never zero + storage_size = scan_state_bytes + ordered_block_id_bytes; + return hipSuccess; + } + + // Start point for time measurements + std::chrono::high_resolution_clock::time_point start; + if(debug_synchronous) + { + std::cout << "size " << size << '\n'; + std::cout << "block_size " << block_size << '\n'; + std::cout << "number of blocks " << number_of_blocks << '\n'; + std::cout << "items_per_block " << items_per_block << '\n'; + } + + if(number_of_blocks > 1) + { + // Create and initialize lookback_scan_state obj + auto scan_state = scan_state_type::create(temporary_storage, number_of_blocks); + // Create ad initialize ordered_block_id obj + auto ptr = reinterpret_cast(temporary_storage); + auto ordered_bid = ordered_block_id_type::create( + reinterpret_cast(ptr + scan_state_bytes)); + + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + auto grid_size = (number_of_blocks + block_size - 1) / block_size; + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_lookback_scan_state_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + scan_state, + number_of_blocks, + ordered_bid); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR( + "init_lookback_scan_state_kernel", size, start) + + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + grid_size = number_of_blocks; + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + lookback_scan_kernel), + dim3(grid_size), + dim3(block_size), + 0, + stream, + input, + output, + size, + static_cast(initial_value), + scan_op, + scan_state, + number_of_blocks, + ordered_bid); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("lookback_scan_kernel", size, start) + } + else + { + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(single_scan_kernel), + dim3(1), + dim3(block_size), + 0, + stream, + input, + size, + static_cast(initial_value), + output, + scan_op); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start); + } + return hipSuccess; } - return hipSuccess; -} #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR #undef ROCPRIM_DETAIL_HIP_SYNC @@ -484,39 +497,39 @@ auto scan_impl(void * temporary_storage, /// // output: [1, 3, 6, 10, 15, 21, 28, 36] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t inclusive_scan(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const size_t size, - BinaryFunction scan_op = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t inclusive_scan(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const size_t size, + BinaryFunction scan_op = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + using result_type = + typename ::rocprim::detail::match_result_type::type; // Get default config if Config is default_config using config = detail::default_or_custom_config< Config, - detail::default_scan_config - >; - - return detail::scan_impl( - temporary_storage, storage_size, - // result_type() is a dummy initial value (not used) - input, output, result_type(), size, - scan_op, stream, debug_synchronous - ); + detail::default_scan_config>; + + return detail::scan_impl(temporary_storage, + storage_size, + // result_type() is a dummy initial value (not used) + input, + output, + result_type(), + size, + scan_op, + stream, + debug_synchronous); } /// \brief Parallel exclusive scan primitive for device level. @@ -603,40 +616,40 @@ hipError_t inclusive_scan(void * temporary_storage, /// // output: [9, 4, 7, 6, 2, 2, 1, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class InitValueType, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t exclusive_scan(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - const InitValueType initial_value, - const size_t size, - BinaryFunction scan_op = BinaryFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t exclusive_scan(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + const InitValueType initial_value, + const size_t size, + BinaryFunction scan_op = BinaryFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + using result_type = + typename ::rocprim::detail::match_result_type::type; // Get default config if Config is default_config using config = detail::default_or_custom_config< Config, - detail::default_scan_config - >; - - return detail::scan_impl( - temporary_storage, storage_size, - input, output, initial_value, size, - scan_op, stream, debug_synchronous - ); + detail::default_scan_config>; + + return detail::scan_impl(temporary_storage, + storage_size, + input, + output, + initial_value, + size, + scan_op, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_scan_by_key.hpp b/rocprim/include/rocprim/device/device_scan_by_key.hpp index b9cee7231..fb71795a9 100644 --- a/rocprim/include/rocprim/device/device_scan_by_key.hpp +++ b/rocprim/include/rocprim/device/device_scan_by_key.hpp @@ -21,16 +21,16 @@ #ifndef ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_ #define ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_ -#include #include +#include #include "../config.hpp" -#include "../iterator/zip_iterator.hpp" #include "../iterator/discard_iterator.hpp" +#include "../iterator/zip_iterator.hpp" #include "../types/tuple.hpp" -#include "../detail/various.hpp" #include "../detail/binary_op_wrappers.hpp" +#include "../detail/various.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -127,60 +127,53 @@ BEGIN_ROCPRIM_NAMESPACE /// // values_output: [1, 2, 3, 7, 5, 11, 18, 8] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class BinaryFunction = ::rocprim::plus::value_type>, - class KeyCompareFunction = ::rocprim::equal_to::value_type> -> -inline -hipError_t inclusive_scan_by_key(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const size_t size, - BinaryFunction scan_op = BinaryFunction(), - KeyCompareFunction key_compare_op = KeyCompareFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>, + class KeyCompareFunction + = ::rocprim::equal_to::value_type>> +inline hipError_t inclusive_scan_by_key(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const size_t size, + BinaryFunction scan_op = BinaryFunction(), + KeyCompareFunction key_compare_op = KeyCompareFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + using result_type = + typename ::rocprim::detail::match_result_type::type; using flag_type = bool; - using headflag_scan_op_wrapper_type = - detail::headflag_scan_op_wrapper< - result_type, flag_type, BinaryFunction - >; + using headflag_scan_op_wrapper_type + = detail::headflag_scan_op_wrapper; // Flag the first item of each segment as its head, // then run inclusive scan return inclusive_scan( - temporary_storage, storage_size, + temporary_storage, + storage_size, rocprim::make_transform_iterator( rocprim::make_counting_iterator(0), - [values_input, keys_input, key_compare_op] - ROCPRIM_DEVICE - (const size_t i) - { + [values_input, keys_input, key_compare_op] ROCPRIM_DEVICE(const size_t i) { flag_type flag(true); if(i > 0) { flag = flag_type(!key_compare_op(keys_input[i - 1], keys_input[i])); } return rocprim::make_tuple(values_input[i], flag); - } - ), - rocprim::make_zip_iterator(rocprim::make_tuple(values_output, rocprim::make_discard_iterator())), + }), + rocprim::make_zip_iterator( + rocprim::make_tuple(values_output, rocprim::make_discard_iterator())), size, headflag_scan_op_wrapper_type(scan_op), stream, - debug_synchronous - ); + debug_synchronous); } /// \brief Parallel exclusive scan-by-key primitive for device level. @@ -276,69 +269,68 @@ hipError_t inclusive_scan_by_key(void * temporary_storage, /// // values_output: [9, 10, 12, 9, 13, 9, 15, 9] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class InitialValueType, - class BinaryFunction = ::rocprim::plus::value_type>, - class KeyCompareFunction = ::rocprim::equal_to::value_type> -> -inline -hipError_t exclusive_scan_by_key(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const InitialValueType initial_value, - const size_t size, - BinaryFunction scan_op = BinaryFunction(), - KeyCompareFunction key_compare_op = KeyCompareFunction(), - const hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>, + class KeyCompareFunction + = ::rocprim::equal_to::value_type>> +inline hipError_t exclusive_scan_by_key(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const InitialValueType initial_value, + const size_t size, + BinaryFunction scan_op = BinaryFunction(), + KeyCompareFunction key_compare_op = KeyCompareFunction(), + const hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + using result_type = + typename ::rocprim::detail::match_result_type::type; using flag_type = bool; - using headflag_scan_op_wrapper_type = - detail::headflag_scan_op_wrapper< - result_type, flag_type, BinaryFunction - >; + using headflag_scan_op_wrapper_type + = detail::headflag_scan_op_wrapper; const result_type initial_value_converted = static_cast(initial_value); // Flag the last item of each segment as the next segment's head, use initial_value as its value, // then run exclusive scan return exclusive_scan( - temporary_storage, storage_size, - rocprim::make_transform_iterator( - rocprim::make_counting_iterator(0), - [values_input, keys_input, key_compare_op, initial_value_converted, size] - ROCPRIM_HOST_DEVICE (const size_t i) - { - flag_type flag(false); - if(i + 1 < size) - { - flag = flag_type(!key_compare_op(keys_input[i], keys_input[i + 1])); - } - result_type value = initial_value_converted; - if(!flag) - { - value = values_input[i]; - } - return rocprim::make_tuple(value, flag); - } - ), - rocprim::make_zip_iterator(rocprim::make_tuple(values_output, rocprim::make_discard_iterator())), - rocprim::make_tuple(initial_value_converted, flag_type(true)), // init value is a head of the first segment + temporary_storage, + storage_size, + rocprim::make_transform_iterator(rocprim::make_counting_iterator(0), + [values_input, + keys_input, + key_compare_op, + initial_value_converted, + size] ROCPRIM_HOST_DEVICE(const size_t i) { + flag_type flag(false); + if(i + 1 < size) + { + flag = flag_type(!key_compare_op( + keys_input[i], keys_input[i + 1])); + } + result_type value = initial_value_converted; + if(!flag) + { + value = values_input[i]; + } + return rocprim::make_tuple(value, flag); + }), + rocprim::make_zip_iterator( + rocprim::make_tuple(values_output, rocprim::make_discard_iterator())), + rocprim::make_tuple(initial_value_converted, + flag_type(true)), // init value is a head of the first segment size, headflag_scan_op_wrapper_type(scan_op), stream, - debug_synchronous - ); + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_scan_config.hpp b/rocprim/include/rocprim/device/device_scan_config.hpp index 126064732..cda625aeb 100644 --- a/rocprim/include/rocprim/device/device_scan_config.hpp +++ b/rocprim/include/rocprim/device/device_scan_config.hpp @@ -27,8 +27,8 @@ #include "../detail/various.hpp" #include "../block/block_load.hpp" -#include "../block/block_store.hpp" #include "../block/block_scan.hpp" +#include "../block/block_store.hpp" #include "config_types.hpp" @@ -45,14 +45,12 @@ BEGIN_ROCPRIM_NAMESPACE /// \tparam BlockLoadMethod - method for loading input values. /// \tparam StoreLoadMethod - method for storing values. /// \tparam BlockScanMethod - algorithm for block scan. -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool UseLookback, - ::rocprim::block_load_method BlockLoadMethod, - ::rocprim::block_store_method BlockStoreMethod, - ::rocprim::block_scan_algorithm BlockScanMethod -> +template struct scan_config { /// \brief Number of threads in a block. @@ -72,46 +70,41 @@ struct scan_config namespace detail { -template -struct scan_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = scan_config< - 256, - ::rocprim::max(1u, 16u / item_scale), - ROCPRIM_DETAIL_USE_LOOKBACK_SCAN, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - ::rocprim::block_scan_algorithm::using_warp_scan - >; -}; - -template -struct scan_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = scan_config< - 256, - ::rocprim::max(1u, 16u / item_scale), - ROCPRIM_DETAIL_USE_LOOKBACK_SCAN, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - ::rocprim::block_scan_algorithm::using_warp_scan - >; -}; - -template -struct default_scan_config - : select_arch< - TargetArch, - select_arch_case<803, scan_config_803>, - select_arch_case<900, scan_config_900>, - scan_config_900 - > { }; + template + struct scan_config_803 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = scan_config<256, + ::rocprim::max(1u, 16u / item_scale), + ROCPRIM_DETAIL_USE_LOOKBACK_SCAN, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + ::rocprim::block_scan_algorithm::using_warp_scan>; + }; + + template + struct scan_config_900 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = scan_config<256, + ::rocprim::max(1u, 16u / item_scale), + ROCPRIM_DETAIL_USE_LOOKBACK_SCAN, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + ::rocprim::block_scan_algorithm::using_warp_scan>; + }; + + template + struct default_scan_config : select_arch>, + select_arch_case<900, scan_config_900>, + scan_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp index a16a07028..57fbe0604 100644 --- a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp +++ b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp @@ -27,15 +27,15 @@ #include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/radix_sort.hpp" +#include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" -#include "device_segmented_radix_sort_config.hpp" #include "detail/device_segmented_radix_sort.hpp" +#include "device_segmented_radix_sort_config.hpp" /// \addtogroup devicemodule /// @{ @@ -45,159 +45,176 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class Config, - bool Descending, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class OffsetIterator -> -__global__ -void segmented_sort_kernel(KeysInputIterator keys_input, - typename std::iterator_traits::value_type * keys_tmp, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - typename std::iterator_traits::value_type * values_tmp, - ValuesOutputIterator values_output, - bool to_output, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int long_iterations, - unsigned int short_iterations, - unsigned int begin_bit, - unsigned int end_bit) -{ - segmented_sort( - keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, - to_output, - begin_offsets, end_offsets, - long_iterations, short_iterations, - begin_bit, end_bit - ); -} + template + __global__ void segmented_sort_kernel( + KeysInputIterator keys_input, + typename std::iterator_traits::value_type* keys_tmp, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + typename std::iterator_traits::value_type* values_tmp, + ValuesOutputIterator values_output, + bool to_output, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int long_iterations, + unsigned int short_iterations, + unsigned int begin_bit, + unsigned int end_bit) + { + segmented_sort(keys_input, + keys_tmp, + keys_output, + values_input, + values_tmp, + values_output, + to_output, + begin_offsets, + end_offsets, + long_iterations, + short_iterations, + begin_bit, + end_bit); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - class Config, - bool Descending, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class OffsetIterator -> -inline -hipError_t segmented_radix_sort_impl(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - typename std::iterator_traits::value_type * keys_tmp, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - typename std::iterator_traits::value_type * values_tmp, - ValuesOutputIterator values_output, - unsigned int size, - bool& is_result_in_output, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit, - unsigned int end_bit, - hipStream_t stream, - bool debug_synchronous) -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; + template + inline hipError_t segmented_radix_sort_impl( + void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + typename std::iterator_traits::value_type* keys_tmp, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + typename std::iterator_traits::value_type* values_tmp, + ValuesOutputIterator values_output, + unsigned int size, + bool& is_result_in_output, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit, + unsigned int end_bit, + hipStream_t stream, + bool debug_synchronous) + { + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; - static_assert( - std::is_same::value_type>::value, - "KeysInputIterator and KeysOutputIterator must have the same value_type" - ); - static_assert( - std::is_same::value_type>::value, - "ValuesInputIterator and ValuesOutputIterator must have the same value_type" - ); + static_assert( + std::is_same::value_type>::value, + "KeysInputIterator and KeysOutputIterator must have the same value_type"); + static_assert( + std::is_same::value_type>::value, + "ValuesInputIterator and ValuesOutputIterator must have the same value_type"); - using config = default_or_custom_config< - Config, - default_segmented_radix_sort_config - >; + using config = default_or_custom_config< + Config, + default_segmented_radix_sort_config>; - constexpr bool with_values = !std::is_same::value; + constexpr bool with_values = !std::is_same::value; - const bool with_double_buffer = keys_tmp != nullptr; + const bool with_double_buffer = keys_tmp != nullptr; - const unsigned int bits = end_bit - begin_bit; - const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits); - const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits; - const unsigned int short_iterations = radix_bits_diff != 0 - ? ::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff) - : 0; - const unsigned int long_iterations = iterations - short_iterations; + const unsigned int bits = end_bit - begin_bit; + const unsigned int iterations + = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits); + const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits; + const unsigned int short_iterations + = radix_bits_diff != 0 ? ::rocprim::min( + iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff) + : 0; + const unsigned int long_iterations = iterations - short_iterations; - const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); - const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0; - if(temporary_storage == nullptr) - { - storage_size = with_double_buffer ? 0 : (keys_bytes + values_bytes); - // Make sure user won't try to allocate 0 bytes memory, otherwise - // user may again pass nullptr as temporary_storage - storage_size = storage_size == 0 ? 4 : storage_size; - return hipSuccess; - } + const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); + const size_t values_bytes + = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0; + if(temporary_storage == nullptr) + { + storage_size = with_double_buffer ? 0 : (keys_bytes + values_bytes); + // Make sure user won't try to allocate 0 bytes memory, otherwise + // user may again pass nullptr as temporary_storage + storage_size = storage_size == 0 ? 4 : storage_size; + return hipSuccess; + } - if(debug_synchronous) - { - std::cout << "iterations " << iterations << '\n'; - std::cout << "long_iterations " << long_iterations << '\n'; - std::cout << "short_iterations " << short_iterations << '\n'; - hipError_t error = hipStreamSynchronize(stream); - if(error != hipSuccess) return error; - } + if(debug_synchronous) + { + std::cout << "iterations " << iterations << '\n'; + std::cout << "long_iterations " << long_iterations << '\n'; + std::cout << "short_iterations " << short_iterations << '\n'; + hipError_t error = hipStreamSynchronize(stream); + if(error != hipSuccess) + return error; + } - char * ptr = reinterpret_cast(temporary_storage); - if(!with_double_buffer) - { - keys_tmp = reinterpret_cast(ptr); - ptr += keys_bytes; - values_tmp = with_values ? reinterpret_cast(ptr) : nullptr; - } + char* ptr = reinterpret_cast(temporary_storage); + if(!with_double_buffer) + { + keys_tmp = reinterpret_cast(ptr); + ptr += keys_bytes; + values_tmp = with_values ? reinterpret_cast(ptr) : nullptr; + } - const bool to_output = with_double_buffer || (iterations - 1) % 2 == 0; + const bool to_output = with_double_buffer || (iterations - 1) % 2 == 0; - std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(segmented_sort_kernel), - dim3(segments), dim3(config::sort::block_size), 0, stream, - keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, - to_output, - begin_offsets, end_offsets, - long_iterations, short_iterations, - begin_bit, end_bit - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort", segments, start) + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_sort_kernel), + dim3(segments), + dim3(config::sort::block_size), + 0, + stream, + keys_input, + keys_tmp, + keys_output, + values_input, + values_tmp, + values_output, + to_output, + begin_offsets, + end_offsets, + long_iterations, + short_iterations, + begin_bit, + end_bit); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort", segments, start) - is_result_in_output = ((iterations % 2 == 0) != to_output); + is_result_in_output = ((iterations % 2 == 0) != to_output); - return hipSuccess; -} + return hipSuccess; + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR @@ -290,38 +307,43 @@ hipError_t segmented_radix_sort_impl(void * temporary_storage, /// // keys_output: [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class OffsetIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t segmented_radix_sort_keys(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t segmented_radix_sort_keys(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool ignored; - return detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values, nullptr, values, - size, ignored, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool ignored; + return detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values, + nullptr, + values, + size, + ignored, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel descending radix sort primitive for device level. @@ -411,38 +433,43 @@ hipError_t segmented_radix_sort_keys(void * temporary_storage, /// // keys_output: [6, 3, 5, 8, 7, 4, 2, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class OffsetIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t segmented_radix_sort_keys_desc(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t segmented_radix_sort_keys_desc(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool ignored; - return detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values, nullptr, values, - size, ignored, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool ignored; + return detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values, + nullptr, + values, + size, + ignored, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel ascending radix sort-by-key primitive for device level. @@ -548,41 +575,46 @@ hipError_t segmented_radix_sort_keys_desc(void * temporary_storage, /// // values_output: [2, -5, -4, -1, -2, 3, 7, -8] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class OffsetIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t segmented_radix_sort_pairs(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t segmented_radix_sort_pairs(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { bool ignored; - return detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values_input, nullptr, values_output, - size, ignored, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + return detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values_input, + nullptr, + values_output, + size, + ignored, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel descending radix sort-by-key primitive for device level. @@ -684,41 +716,46 @@ hipError_t segmented_radix_sort_pairs(void * temporary_storage, /// // values_output: [-5, 2, -4, -8, 7, 3, -1, -2] /// \endcode /// \endparblock -template< - class Config = default_config, - class KeysInputIterator, - class KeysOutputIterator, - class ValuesInputIterator, - class ValuesOutputIterator, - class OffsetIterator, - class Key = typename std::iterator_traits::value_type -> -inline -hipError_t segmented_radix_sort_pairs_desc(void * temporary_storage, - size_t& storage_size, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type> +inline hipError_t segmented_radix_sort_pairs_desc(void* temporary_storage, + size_t& storage_size, + KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { bool ignored; - return detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys_input, nullptr, keys_output, - values_input, nullptr, values_output, - size, ignored, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + return detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys_input, + nullptr, + keys_output, + values_input, + nullptr, + values_output, + size, + ignored, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); } /// \brief Parallel ascending radix sort primitive for device level. @@ -812,35 +849,38 @@ hipError_t segmented_radix_sort_pairs_desc(void * temporary_storage, /// // keys.current(): [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key, - class OffsetIterator -> -inline -hipError_t segmented_radix_sort_keys(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t segmented_radix_sort_keys(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool is_result_in_output; - hipError_t error = detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values, values, values, - size, is_result_in_output, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool is_result_in_output; + hipError_t error = detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values, + values, + values, + size, + is_result_in_output, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); @@ -939,35 +979,38 @@ hipError_t segmented_radix_sort_keys(void * temporary_storage, /// // keys.current(): [6, 3, 5, 8, 7, 4, 2, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key, - class OffsetIterator -> -inline -hipError_t segmented_radix_sort_keys_desc(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t segmented_radix_sort_keys_desc(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - empty_type * values = nullptr; - bool is_result_in_output; - hipError_t error = detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values, values, values, - size, is_result_in_output, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + empty_type* values = nullptr; + bool is_result_in_output; + hipError_t error = detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values, + values, + values, + size, + is_result_in_output, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); @@ -1079,36 +1122,38 @@ hipError_t segmented_radix_sort_keys_desc(void * temporary_storage, /// // values.current(): [2, -5, -4, -1, -2, 3, 7, -8] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key, - class Value, - class OffsetIterator -> -inline -hipError_t segmented_radix_sort_pairs(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - double_buffer& values, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t segmented_radix_sort_pairs(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + double_buffer& values, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - bool is_result_in_output; - hipError_t error = detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values.current(), values.current(), values.alternate(), - size, is_result_in_output, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + bool is_result_in_output; + hipError_t error = detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values.current(), + values.current(), + values.alternate(), + size, + is_result_in_output, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); @@ -1215,36 +1260,38 @@ hipError_t segmented_radix_sort_pairs(void * temporary_storage, /// // values.current(): [-5, 2, -4, -8, 7, 3, -1, -2] /// \endcode /// \endparblock -template< - class Config = default_config, - class Key, - class Value, - class OffsetIterator -> -inline -hipError_t segmented_radix_sort_pairs_desc(void * temporary_storage, - size_t& storage_size, - double_buffer& keys, - double_buffer& values, - unsigned int size, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - unsigned int begin_bit = 0, - unsigned int end_bit = 8 * sizeof(Key), - hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t segmented_radix_sort_pairs_desc(void* temporary_storage, + size_t& storage_size, + double_buffer& keys, + double_buffer& values, + unsigned int size, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + unsigned int begin_bit = 0, + unsigned int end_bit = 8 * sizeof(Key), + hipStream_t stream = 0, + bool debug_synchronous = false) { - bool is_result_in_output; - hipError_t error = detail::segmented_radix_sort_impl( - temporary_storage, storage_size, - keys.current(), keys.current(), keys.alternate(), - values.current(), values.current(), values.alternate(), - size, is_result_in_output, - segments, begin_offsets, end_offsets, - begin_bit, end_bit, - stream, debug_synchronous - ); + bool is_result_in_output; + hipError_t error = detail::segmented_radix_sort_impl(temporary_storage, + storage_size, + keys.current(), + keys.current(), + keys.alternate(), + values.current(), + values.current(), + values.alternate(), + size, + is_result_in_output, + segments, + begin_offsets, + end_offsets, + begin_bit, + end_bit, + stream, + debug_synchronous); if(temporary_storage != nullptr && is_result_in_output) { keys.swap(); diff --git a/rocprim/include/rocprim/device/device_segmented_radix_sort_config.hpp b/rocprim/include/rocprim/device/device_segmented_radix_sort_config.hpp index 696a8c5cc..1d23e80ea 100644 --- a/rocprim/include/rocprim/device/device_segmented_radix_sort_config.hpp +++ b/rocprim/include/rocprim/device/device_segmented_radix_sort_config.hpp @@ -45,11 +45,7 @@ BEGIN_ROCPRIM_NAMESPACE /// \tparam LongRadixBits - number of bits in long iterations. /// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits. /// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config. -template< - unsigned int LongRadixBits, - unsigned int ShortRadixBits, - class SortConfig -> +template struct segmented_radix_sort_config { /// \brief Number of bits in long iterations. @@ -63,86 +59,80 @@ struct segmented_radix_sort_config namespace detail { -template -struct segmented_radix_sort_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); - - using type = select_type< - select_type_case< - (sizeof(Key) == 1 && sizeof(Value) <= 8), - segmented_radix_sort_config<8, 7, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 2 && sizeof(Value) <= 8), - segmented_radix_sort_config<8, 7, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 4 && sizeof(Value) <= 8), - segmented_radix_sort_config<7, 6, kernel_config<256, 15> > - >, - select_type_case< - (sizeof(Key) == 8 && sizeof(Value) <= 8), - segmented_radix_sort_config<7, 6, kernel_config<256, 13> > - >, - segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)> > - >; -}; - -template -struct segmented_radix_sort_config_803 - : select_type< - select_type_case > >, - select_type_case > >, - select_type_case > >, - select_type_case > > - > { }; - -template -struct segmented_radix_sort_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); - - using type = select_type< - select_type_case< - (sizeof(Key) == 1 && sizeof(Value) <= 8), - segmented_radix_sort_config<4, 4, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 2 && sizeof(Value) <= 8), - segmented_radix_sort_config<6, 5, kernel_config<256, 10> > - >, - select_type_case< - (sizeof(Key) == 4 && sizeof(Value) <= 8), - segmented_radix_sort_config<7, 6, kernel_config<256, 15> > - >, - select_type_case< - (sizeof(Key) == 8 && sizeof(Value) <= 8), - segmented_radix_sort_config<7, 6, kernel_config<256, 15> > - >, - segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)> > - >; -}; - -template -struct segmented_radix_sort_config_900 - : select_type< - select_type_case > >, - select_type_case > >, - select_type_case > >, - select_type_case > > - > { }; - -template -struct default_segmented_radix_sort_config - : select_arch< - TargetArch, - select_arch_case<803, detail::segmented_radix_sort_config_803 >, - select_arch_case<900, detail::segmented_radix_sort_config_900 >, - detail::segmented_radix_sort_config_900 - > { }; + template + struct segmented_radix_sort_config_803 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + using type = select_type< + select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), + segmented_radix_sort_config<8, 7, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), + segmented_radix_sort_config<8, 7, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), + segmented_radix_sort_config<7, 6, kernel_config<256, 15>>>, + select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), + segmented_radix_sort_config<7, 6, kernel_config<256, 13>>>, + segmented_radix_sort_config<7, + 6, + kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>>>; + }; + + template + struct segmented_radix_sort_config_803 + : select_type>>, + select_type_case>>, + select_type_case>>, + select_type_case>>> + { + }; + + template + struct segmented_radix_sort_config_900 + { + static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( + ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + using type = select_type< + select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), + segmented_radix_sort_config<4, 4, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), + segmented_radix_sort_config<6, 5, kernel_config<256, 10>>>, + select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), + segmented_radix_sort_config<7, 6, kernel_config<256, 15>>>, + select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), + segmented_radix_sort_config<7, 6, kernel_config<256, 15>>>, + segmented_radix_sort_config<7, + 6, + kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>>>; + }; + + template + struct segmented_radix_sort_config_900 + : select_type>>, + select_type_case>>, + select_type_case>>, + select_type_case>>> + { + }; + + template + struct default_segmented_radix_sort_config + : select_arch>, + select_arch_case<900, detail::segmented_radix_sort_config_900>, + detail::segmented_radix_sort_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_segmented_reduce.hpp b/rocprim/include/rocprim/device/device_segmented_reduce.hpp index 7ae1aee6e..98c5bed02 100644 --- a/rocprim/include/rocprim/device/device_segmented_reduce.hpp +++ b/rocprim/include/rocprim/device/device_segmented_reduce.hpp @@ -21,13 +21,13 @@ #ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_ #define ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_ -#include #include +#include #include "../config.hpp" -#include "../functional.hpp" -#include "../detail/various.hpp" #include "../detail/match_result_type.hpp" +#include "../detail/various.hpp" +#include "../functional.hpp" #include "detail/device_segmented_reduce.hpp" @@ -39,100 +39,96 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class Config, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class ResultType, - class BinaryFunction -> -__global__ -void segmented_reduce_kernel(InputIterator input, - OutputIterator output, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - BinaryFunction reduce_op, - ResultType initial_value) -{ - segmented_reduce( - input, output, - begin_offsets, end_offsets, - reduce_op, initial_value - ); -} + template + __global__ void segmented_reduce_kernel(InputIterator input, + OutputIterator output, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + BinaryFunction reduce_op, + ResultType initial_value) + { + segmented_reduce( + input, output, begin_offsets, end_offsets, reduce_op, initial_value); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - class Config, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class InitValueType, - class BinaryFunction -> -inline -hipError_t segmented_reduce_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - BinaryFunction reduce_op, - InitValueType initial_value, - hipStream_t stream, - bool debug_synchronous) -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + template + inline hipError_t segmented_reduce_impl(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + BinaryFunction reduce_op, + InitValueType initial_value, + hipStream_t stream, + bool debug_synchronous) + { + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; - // Get default config if Config is default_config - using config = default_or_custom_config< - Config, - default_reduce_config - >; + // Get default config if Config is default_config + using config + = default_or_custom_config>; - constexpr unsigned int block_size = config::block_size; + constexpr unsigned int block_size = config::block_size; - if(temporary_storage == nullptr) - { - // Make sure user won't try to allocate 0 bytes memory, because - // hipMalloc will return nullptr when size is zero. - storage_size = 4; - return hipSuccess; - } + if(temporary_storage == nullptr) + { + // Make sure user won't try to allocate 0 bytes memory, because + // hipMalloc will return nullptr when size is zero. + storage_size = 4; + return hipSuccess; + } - std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(segmented_reduce_kernel), - dim3(segments), dim3(block_size), 0, stream, - input, output, - begin_offsets, end_offsets, - reduce_op, static_cast(initial_value) - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_reduce", segments, start); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_reduce_kernel), + dim3(segments), + dim3(block_size), + 0, + stream, + input, + output, + begin_offsets, + end_offsets, + reduce_op, + static_cast(initial_value)); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_reduce", segments, start); - return hipSuccess; -} + return hipSuccess; + } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR @@ -232,34 +228,36 @@ hipError_t segmented_reduce_impl(void * temporary_storage, /// // output: [4, 6, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class BinaryFunction = ::rocprim::plus::value_type>, - class InitValueType = typename std::iterator_traits::value_type -> -inline -hipError_t segmented_reduce(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - BinaryFunction reduce_op = BinaryFunction(), - InitValueType initial_value = InitValueType(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>, + class InitValueType = typename std::iterator_traits::value_type> +inline hipError_t segmented_reduce(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + BinaryFunction reduce_op = BinaryFunction(), + InitValueType initial_value = InitValueType(), + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::segmented_reduce_impl( - temporary_storage, storage_size, - input, output, - segments, begin_offsets, end_offsets, - reduce_op, initial_value, - stream, debug_synchronous - ); + return detail::segmented_reduce_impl(temporary_storage, + storage_size, + input, + output, + segments, + begin_offsets, + end_offsets, + reduce_op, + initial_value, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_segmented_scan.hpp b/rocprim/include/rocprim/device/device_segmented_scan.hpp index 7c3a9e967..b1417129a 100644 --- a/rocprim/include/rocprim/device/device_segmented_scan.hpp +++ b/rocprim/include/rocprim/device/device_segmented_scan.hpp @@ -21,20 +21,20 @@ #ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_ #define ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_ -#include #include +#include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/match_result_type.hpp" +#include "../detail/various.hpp" -#include "../iterator/zip_iterator.hpp" #include "../iterator/discard_iterator.hpp" #include "../iterator/transform_iterator.hpp" +#include "../iterator/zip_iterator.hpp" #include "../types/tuple.hpp" -#include "device_scan_config.hpp" #include "detail/device_segmented_scan.hpp" +#include "device_scan_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -44,101 +44,102 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - bool Exclusive, - class Config, - class ResultType, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class InitValueType, - class BinaryFunction -> -__global__ -void segmented_scan_kernel(InputIterator input, - OutputIterator output, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - InitValueType initial_value, - BinaryFunction scan_op) -{ - segmented_scan( - input, output, begin_offsets, end_offsets, - static_cast(initial_value), scan_op - ); -} + template + __global__ void segmented_scan_kernel(InputIterator input, + OutputIterator output, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + InitValueType initial_value, + BinaryFunction scan_op) + { + segmented_scan(input, + output, + begin_offsets, + end_offsets, + static_cast(initial_value), + scan_op); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } -template< - bool Exclusive, - class Config, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class InitValueType, - class BinaryFunction -> -inline -hipError_t segmented_scan_impl(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - const InitValueType initial_value, - BinaryFunction scan_op, - hipStream_t stream, - bool debug_synchronous) -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + template + inline hipError_t segmented_scan_impl(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + const InitValueType initial_value, + BinaryFunction scan_op, + hipStream_t stream, + bool debug_synchronous) + { + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; - // Get default config if Config is default_config - using config = default_or_custom_config< - Config, - default_scan_config - >; + // Get default config if Config is default_config + using config + = default_or_custom_config>; - constexpr unsigned int block_size = config::block_size; + constexpr unsigned int block_size = config::block_size; - if(temporary_storage == nullptr) - { - // Make sure user won't try to allocate 0 bytes memory, because - // hipMalloc will return nullptr when size is zero. - storage_size = 4; + if(temporary_storage == nullptr) + { + // Make sure user won't try to allocate 0 bytes memory, because + // hipMalloc will return nullptr when size is zero. + storage_size = 4; + return hipSuccess; + } + + std::chrono::high_resolution_clock::time_point start; + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(segmented_scan_kernel), + dim3(segments), + dim3(block_size), + 0, + stream, + input, + output, + begin_offsets, + end_offsets, + initial_value, + scan_op); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_scan", segments, start); return hipSuccess; } - std::chrono::high_resolution_clock::time_point start; - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(segmented_scan_kernel), - dim3(segments), dim3(block_size), 0, stream, - input, output, - begin_offsets, end_offsets, - initial_value, scan_op - ); - ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_scan", segments, start); - return hipSuccess; -} - #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR } // end of detail namespace @@ -229,35 +230,38 @@ hipError_t segmented_scan_impl(void * temporary_storage, /// // output: [4, 4, 6, 2, 5, 1, 1, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t segmented_inclusive_scan(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - BinaryFunction scan_op = BinaryFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t segmented_inclusive_scan(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + BinaryFunction scan_op = BinaryFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + using result_type = + typename ::rocprim::detail::match_result_type::type; - return detail::segmented_scan_impl( - temporary_storage, storage_size, - input, output, segments, begin_offsets, end_offsets, result_type(), - scan_op, stream, debug_synchronous - ); + return detail::segmented_scan_impl(temporary_storage, + storage_size, + input, + output, + segments, + begin_offsets, + end_offsets, + result_type(), + scan_op, + stream, + debug_synchronous); } /// \brief Parallel segmented exclusive scan primitive for device level. @@ -351,32 +355,36 @@ hipError_t segmented_inclusive_scan(void * temporary_storage, /// // output: [9, 4, 9, 6, 9, 5, 1, 1] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class OffsetIterator, - class InitValueType, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t segmented_exclusive_scan(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - unsigned int segments, - OffsetIterator begin_offsets, - OffsetIterator end_offsets, - const InitValueType initial_value, - BinaryFunction scan_op = BinaryFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t segmented_exclusive_scan(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + unsigned int segments, + OffsetIterator begin_offsets, + OffsetIterator end_offsets, + const InitValueType initial_value, + BinaryFunction scan_op = BinaryFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { - return detail::segmented_scan_impl( - temporary_storage, storage_size, - input, output, segments, begin_offsets, end_offsets, initial_value, - scan_op, stream, debug_synchronous - ); + return detail::segmented_scan_impl(temporary_storage, + storage_size, + input, + output, + segments, + begin_offsets, + end_offsets, + initial_value, + scan_op, + stream, + debug_synchronous); } /// \brief Parallel segmented inclusive scan primitive for device level. @@ -457,41 +465,38 @@ hipError_t segmented_exclusive_scan(void * temporary_storage, /// // output: [1, 3, 6, 4, 9, 6, 13, 21] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class HeadFlagIterator, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t segmented_inclusive_scan(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - HeadFlagIterator head_flags, - size_t size, - BinaryFunction scan_op = BinaryFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t segmented_inclusive_scan(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + HeadFlagIterator head_flags, + size_t size, + BinaryFunction scan_op = BinaryFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + using result_type = + typename ::rocprim::detail::match_result_type::type; using flag_type = typename std::iterator_traits::value_type; - using headflag_scan_op_wrapper_type = - detail::headflag_scan_op_wrapper< - result_type, flag_type, BinaryFunction - >; + using headflag_scan_op_wrapper_type + = detail::headflag_scan_op_wrapper; return inclusive_scan( - temporary_storage, storage_size, + temporary_storage, + storage_size, rocprim::make_zip_iterator(rocprim::make_tuple(input, head_flags)), rocprim::make_zip_iterator(rocprim::make_tuple(output, rocprim::make_discard_iterator())), - size, headflag_scan_op_wrapper_type(scan_op), - stream, debug_synchronous - ); + size, + headflag_scan_op_wrapper_type(scan_op), + stream, + debug_synchronous); } /// \brief Parallel segmented exclusive scan primitive for device level. @@ -575,48 +580,41 @@ hipError_t segmented_inclusive_scan(void * temporary_storage, /// // output: [9, 10, 12, 9, 13, 9, 15, 22] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class InitValueType, - class HeadFlagIterator, - class BinaryFunction = ::rocprim::plus::value_type> -> -inline -hipError_t segmented_exclusive_scan(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - HeadFlagIterator head_flags, - const InitValueType initial_value, - size_t size, - BinaryFunction scan_op = BinaryFunction(), - hipStream_t stream = 0, - bool debug_synchronous = false) +template ::value_type>> +inline hipError_t segmented_exclusive_scan(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + HeadFlagIterator head_flags, + const InitValueType initial_value, + size_t size, + BinaryFunction scan_op = BinaryFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) { using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryFunction - >::type; + using result_type = + typename ::rocprim::detail::match_result_type::type; using flag_type = typename std::iterator_traits::value_type; - using headflag_scan_op_wrapper_type = - detail::headflag_scan_op_wrapper< - result_type, flag_type, BinaryFunction - >; + using headflag_scan_op_wrapper_type + = detail::headflag_scan_op_wrapper; const result_type initial_value_converted = static_cast(initial_value); // Flag the last item of each segment as the next segment's head, use initial_value as its value, // then run exclusive scan return exclusive_scan( - temporary_storage, storage_size, + temporary_storage, + storage_size, rocprim::make_transform_iterator( rocprim::make_counting_iterator(0), - [input, head_flags, initial_value_converted, size] - ROCPRIM_DEVICE - (const size_t i) - { + [input, head_flags, initial_value_converted, size] ROCPRIM_DEVICE(const size_t i) { flag_type flag(false); if(i + 1 < size) { @@ -628,15 +626,14 @@ hipError_t segmented_exclusive_scan(void * temporary_storage, value = input[i]; } return rocprim::make_tuple(value, flag); - } - ), + }), rocprim::make_zip_iterator(rocprim::make_tuple(output, rocprim::make_discard_iterator())), - rocprim::make_tuple(initial_value_converted, flag_type(true)), // init value is a head of the first segment + rocprim::make_tuple(initial_value_converted, + flag_type(true)), // init value is a head of the first segment size, headflag_scan_op_wrapper_type(scan_op), stream, - debug_synchronous - ); + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_select.hpp b/rocprim/include/rocprim/device/device_select.hpp index 3ca3080bb..f1b261f27 100644 --- a/rocprim/include/rocprim/device/device_select.hpp +++ b/rocprim/include/rocprim/device/device_select.hpp @@ -21,17 +21,17 @@ #ifndef ROCPRIM_DEVICE_DEVICE_SELECT_HPP_ #define ROCPRIM_DEVICE_DEVICE_SELECT_HPP_ -#include #include +#include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/binary_op_wrappers.hpp" +#include "../detail/various.hpp" #include "../iterator/transform_iterator.hpp" -#include "device_scan.hpp" #include "device_partition.hpp" +#include "device_scan.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -41,18 +41,20 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } } // end detail namespace @@ -136,33 +138,37 @@ namespace detail /// // output_count: 4 /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class FlagIterator, - class OutputIterator, - class SelectedCountOutputIterator -> -inline -hipError_t select(void * temporary_storage, - size_t& storage_size, - InputIterator input, - FlagIterator flags, - OutputIterator output, - SelectedCountOutputIterator selected_count_output, - const size_t size, - const hipStream_t stream = 0, - const bool debug_synchronous = false) +template +inline hipError_t select(void* temporary_storage, + size_t& storage_size, + InputIterator input, + FlagIterator flags, + OutputIterator output, + SelectedCountOutputIterator selected_count_output, + const size_t size, + const hipStream_t stream = 0, + const bool debug_synchronous = false) { // Dummy unary predicate using unary_predicate_type = ::rocprim::empty_type; // Dummy inequality operation using inequality_op_type = ::rocprim::empty_type; - return detail::partition_impl( - temporary_storage, storage_size, input, flags, output, selected_count_output, - size, unary_predicate_type(), inequality_op_type(), stream, debug_synchronous - ); + return detail::partition_impl(temporary_storage, + storage_size, + input, + flags, + output, + selected_count_output, + size, + unary_predicate_type(), + inequality_op_type(), + stream, + debug_synchronous); } /// \brief Parallel select primitive for device level using selection operator. @@ -247,34 +253,39 @@ hipError_t select(void * temporary_storage, /// // output_count: 4 /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class SelectedCountOutputIterator, - class UnaryPredicate -> -inline -hipError_t select(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - SelectedCountOutputIterator selected_count_output, - const size_t size, - UnaryPredicate predicate, - const hipStream_t stream = 0, - const bool debug_synchronous = false) +template +inline hipError_t select(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + SelectedCountOutputIterator selected_count_output, + const size_t size, + UnaryPredicate predicate, + const hipStream_t stream = 0, + const bool debug_synchronous = false) { // Dummy flag type - using flag_type = ::rocprim::empty_type; - flag_type * flags = nullptr; + using flag_type = ::rocprim::empty_type; + flag_type* flags = nullptr; // Dummy inequality operation using inequality_op_type = ::rocprim::empty_type; return detail::partition_impl( - temporary_storage, storage_size, input, flags, output, selected_count_output, - size, predicate, inequality_op_type(), stream, debug_synchronous - ); + temporary_storage, + storage_size, + input, + flags, + output, + selected_count_output, + size, + predicate, + inequality_op_type(), + stream, + debug_synchronous); } /// \brief Device-level parallel unique primitive. @@ -351,38 +362,44 @@ hipError_t select(void * temporary_storage, /// // output_count: 5 /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class UniqueCountOutputIterator, - class EqualityOp = ::rocprim::equal_to::value_type> -> -inline -hipError_t unique(void * temporary_storage, - size_t& storage_size, - InputIterator input, - OutputIterator output, - UniqueCountOutputIterator unique_count_output, - const size_t size, - EqualityOp equality_op = EqualityOp(), - const hipStream_t stream = 0, - const bool debug_synchronous = false) +template ::value_type>> +inline hipError_t unique(void* temporary_storage, + size_t& storage_size, + InputIterator input, + OutputIterator output, + UniqueCountOutputIterator unique_count_output, + const size_t size, + EqualityOp equality_op = EqualityOp(), + const hipStream_t stream = 0, + const bool debug_synchronous = false) { // Dummy unary predicate using unary_predicate_type = ::rocprim::empty_type; // Dummy flag type - using flag_type = ::rocprim::empty_type; - flag_type * flags = nullptr; + using flag_type = ::rocprim::empty_type; + flag_type* flags = nullptr; // Convert equality operator to inequality operator auto inequality_op = detail::inequality_wrapper(equality_op); return detail::partition_impl( - temporary_storage, storage_size, input, flags, output, unique_count_output, - size, unary_predicate_type(), inequality_op, stream, debug_synchronous - ); + temporary_storage, + storage_size, + input, + flags, + output, + unique_count_output, + size, + unary_predicate_type(), + inequality_op, + stream, + debug_synchronous); } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR diff --git a/rocprim/include/rocprim/device/device_select_config.hpp b/rocprim/include/rocprim/device/device_select_config.hpp index 901ede79c..0b3c22c26 100644 --- a/rocprim/include/rocprim/device/device_select_config.hpp +++ b/rocprim/include/rocprim/device/device_select_config.hpp @@ -43,13 +43,11 @@ BEGIN_ROCPRIM_NAMESPACE /// \tparam ValueBlockLoadMethod - method for loading input values. /// \tparam FlagBlockLoadMethod - method for loading flag values. /// \tparam BlockScanMethod - algorithm for block scan. -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - ::rocprim::block_load_method ValueBlockLoadMethod, - ::rocprim::block_load_method FlagBlockLoadMethod, - ::rocprim::block_scan_algorithm BlockScanMethod -> +template struct select_config { /// \brief Number of threads in a block. @@ -67,44 +65,39 @@ struct select_config namespace detail { -template -struct select_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = select_config< - 256, - ::rocprim::max(1u, 13u / item_scale), - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_scan_algorithm::using_warp_scan - >; -}; - -template -struct select_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = select_config< - 256, - ::rocprim::max(1u, 15u / item_scale), - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_scan_algorithm::using_warp_scan - >; -}; - -template -struct default_select_config - : select_arch< - TargetArch, - select_arch_case<803, select_config_803>, - select_arch_case<900, select_config_900>, - select_config_803 - > { }; + template + struct select_config_803 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = select_config<256, + ::rocprim::max(1u, 13u / item_scale), + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_scan_algorithm::using_warp_scan>; + }; + + template + struct select_config_900 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = select_config<256, + ::rocprim::max(1u, 15u / item_scale), + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_scan_algorithm::using_warp_scan>; + }; + + template + struct default_select_config : select_arch>, + select_arch_case<900, select_config_900>, + select_config_803> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/device/device_transform.hpp b/rocprim/include/rocprim/device/device_transform.hpp index bccf3fb78..a7a1ff45d 100644 --- a/rocprim/include/rocprim/device/device_transform.hpp +++ b/rocprim/include/rocprim/device/device_transform.hpp @@ -21,17 +21,17 @@ #ifndef ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_ #define ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_ -#include #include +#include #include "../config.hpp" -#include "../detail/various.hpp" #include "../detail/match_result_type.hpp" -#include "../types/tuple.hpp" +#include "../detail/various.hpp" #include "../iterator/zip_iterator.hpp" +#include "../types/tuple.hpp" -#include "device_transform_config.hpp" #include "detail/device_transform.hpp" +#include "device_transform_config.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -41,38 +41,36 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class ResultType, - class InputIterator, - class OutputIterator, - class UnaryFunction -> -__global__ -void transform_kernel(InputIterator input, - const size_t size, - OutputIterator output, - UnaryFunction transform_op) -{ - transform_kernel_impl( - input, size, output, transform_op - ); -} + template + __global__ void transform_kernel(InputIterator input, + const size_t size, + OutputIterator output, + UnaryFunction transform_op) + { + transform_kernel_impl( + input, size, output, transform_op); + } -#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ - { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ - if(debug_synchronous) \ - { \ - std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ - } \ +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + { \ + auto error = hipPeekAtLastError(); \ + if(error != hipSuccess) \ + return error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + auto error = hipStreamSynchronize(stream); \ + if(error != hipSuccess) \ + return error; \ + auto end = std::chrono::high_resolution_clock::now(); \ + auto d = std::chrono::duration_cast>(end - start); \ + std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + } \ } } // end of detail namespace @@ -131,37 +129,33 @@ void transform_kernel(InputIterator input, /// // output: [6, 7, 8, 9, 10, 11, 12, 13] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator, - class OutputIterator, - class UnaryFunction -> -inline -hipError_t transform(InputIterator input, - OutputIterator output, - const size_t size, - UnaryFunction transform_op, - const hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t transform(InputIterator input, + OutputIterator output, + const size_t size, + UnaryFunction transform_op, + const hipStream_t stream = 0, + bool debug_synchronous = false) { - using input_type = typename std::iterator_traits::value_type; + using input_type = typename std::iterator_traits::value_type; using result_type = typename ::rocprim::detail::invoke_result::type; // Get default config if Config is default_config using config = detail::default_or_custom_config< Config, - detail::default_transform_config - >; + detail::default_transform_config>; - constexpr unsigned int block_size = config::block_size; + constexpr unsigned int block_size = config::block_size; constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; // Start point for time measurements std::chrono::high_resolution_clock::time_point start; - auto number_of_blocks = (size + items_per_block - 1)/items_per_block; + auto number_of_blocks = (size + items_per_block - 1) / items_per_block; if(debug_synchronous) { std::cout << "block_size " << block_size << '\n'; @@ -169,15 +163,22 @@ hipError_t transform(InputIterator input, std::cout << "items_per_block " << items_per_block << '\n'; } - if(debug_synchronous) start = std::chrono::high_resolution_clock::now(); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(detail::transform_kernel< - block_size, items_per_thread, result_type, - InputIterator, OutputIterator, UnaryFunction - >), - dim3(number_of_blocks), dim3(block_size), 0, stream, - input, size, output, transform_op - ); + if(debug_synchronous) + start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(detail::transform_kernel), + dim3(number_of_blocks), + dim3(block_size), + 0, + stream, + input, + size, + output, + transform_op); ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("transform_kernel", size, start); return hipSuccess; @@ -241,29 +242,28 @@ hipError_t transform(InputIterator input, /// // output: [2, 4, 6, 8, 10, 12, 14, 16] /// \endcode /// \endparblock -template< - class Config = default_config, - class InputIterator1, - class InputIterator2, - class OutputIterator, - class BinaryFunction -> -inline -hipError_t transform(InputIterator1 input1, - InputIterator2 input2, - OutputIterator output, - const size_t size, - BinaryFunction transform_op, - const hipStream_t stream = 0, - bool debug_synchronous = false) +template +inline hipError_t transform(InputIterator1 input1, + InputIterator2 input2, + OutputIterator output, + const size_t size, + BinaryFunction transform_op, + const hipStream_t stream = 0, + bool debug_synchronous = false) { using value_type1 = typename std::iterator_traits::value_type; using value_type2 = typename std::iterator_traits::value_type; return transform( - ::rocprim::make_zip_iterator(::rocprim::make_tuple(input1, input2)), output, - size, detail::unpack_binary_op(transform_op), - stream, debug_synchronous - ); + ::rocprim::make_zip_iterator(::rocprim::make_tuple(input1, input2)), + output, + size, + detail::unpack_binary_op(transform_op), + stream, + debug_synchronous); } #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR diff --git a/rocprim/include/rocprim/device/device_transform_config.hpp b/rocprim/include/rocprim/device/device_transform_config.hpp index 5523b5137..ea63a9805 100644 --- a/rocprim/include/rocprim/device/device_transform_config.hpp +++ b/rocprim/include/rocprim/device/device_transform_config.hpp @@ -34,38 +34,38 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Configuration of device-level transform primitives. -template +template using transform_config = kernel_config; namespace detail { -template -struct transform_config_803 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>; -}; - -template -struct transform_config_900 -{ - static constexpr unsigned int item_scale = - ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - - using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>; -}; - -template -struct default_transform_config - : select_arch< - TargetArch, - select_arch_case<803, transform_config_803>, - select_arch_case<900, transform_config_900>, - transform_config_900 - > { }; + template + struct transform_config_803 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>; + }; + + template + struct transform_config_900 + { + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>; + }; + + template + struct default_transform_config + : select_arch>, + select_arch_case<900, transform_config_900>, + transform_config_900> + { + }; } // end namespace detail diff --git a/rocprim/include/rocprim/functional.hpp b/rocprim/include/rocprim/functional.hpp index 4fd622cd3..9cff51631 100644 --- a/rocprim/include/rocprim/functional.hpp +++ b/rocprim/include/rocprim/functional.hpp @@ -31,155 +31,139 @@ BEGIN_ROCPRIM_NAMESPACE /// \addtogroup utilsmodule_functional /// @{ -template -ROCPRIM_HOST_DEVICE inline -constexpr T max(const T& a, const T& b) +template +ROCPRIM_HOST_DEVICE inline constexpr T max(const T& a, const T& b) { return a < b ? b : a; } -template -ROCPRIM_HOST_DEVICE inline -constexpr T min(const T& a, const T& b) +template +ROCPRIM_HOST_DEVICE inline constexpr T min(const T& a, const T& b) { return a < b ? a : b; } -template -ROCPRIM_HOST_DEVICE inline -void swap(T& a, T& b) +template +ROCPRIM_HOST_DEVICE inline void swap(T& a, T& b) { T c = a; - a = b; - b = c; + a = b; + b = c; } -template +template struct less { - ROCPRIM_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const { return a < b; } }; -template<> +template <> struct less { - template - ROCPRIM_HOST_DEVICE inline - constexpr bool operator()(const T& a, const U& b) const + template + ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const U& b) const { return a < b; } }; -template +template struct less_equal { - ROCPRIM_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const { return a <= b; } }; -template +template struct greater { - ROCPRIM_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const { return a > b; } }; -template +template struct greater_equal { - ROCPRIM_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const { return a >= b; } }; -template +template struct equal_to { - ROCPRIM_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const { return a == b; } }; -template +template struct not_equal_to { - ROCPRIM_HOST_DEVICE inline - constexpr bool operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const { return a != b; } }; -template +template struct plus { - ROCPRIM_HOST_DEVICE inline - constexpr T operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a + b; } }; -template +template struct minus { - ROCPRIM_HOST_DEVICE inline - constexpr T operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a - b; } }; -template +template struct multiplies { - ROCPRIM_HOST_DEVICE inline - constexpr T operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a * b; } }; -template +template struct maximum { - ROCPRIM_HOST_DEVICE inline - constexpr T operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a < b ? b : a; } }; -template +template struct minimum { - ROCPRIM_HOST_DEVICE inline - constexpr T operator()(const T& a, const T& b) const + ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a < b ? a : b; } }; -template +template struct identity { - ROCPRIM_HOST_DEVICE inline - constexpr T operator()(const T& a) const + ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a) const { return a; } diff --git a/rocprim/include/rocprim/intrinsics/atomic.hpp b/rocprim/include/rocprim/intrinsics/atomic.hpp index b17eca599..f0c5a727f 100644 --- a/rocprim/include/rocprim/intrinsics/atomic.hpp +++ b/rocprim/include/rocprim/intrinsics/atomic.hpp @@ -27,44 +27,39 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { - ROCPRIM_DEVICE inline - unsigned int atomic_add(unsigned int * address, unsigned int value) + ROCPRIM_DEVICE inline unsigned int atomic_add(unsigned int* address, unsigned int value) { return ::atomicAdd(address, value); } - ROCPRIM_DEVICE inline - int atomic_add(int * address, int value) + ROCPRIM_DEVICE inline int atomic_add(int* address, int value) { return ::atomicAdd(address, value); } - ROCPRIM_DEVICE inline - float atomic_add(float * address, float value) + ROCPRIM_DEVICE inline float atomic_add(float* address, float value) { return ::atomicAdd(address, value); } - ROCPRIM_DEVICE inline - unsigned long long atomic_add(unsigned long long * address, unsigned long long value) + ROCPRIM_DEVICE inline unsigned long long atomic_add(unsigned long long* address, + unsigned long long value) { return ::atomicAdd(address, value); } - ROCPRIM_DEVICE inline - unsigned int atomic_wrapinc(unsigned int * address, unsigned int value) + ROCPRIM_DEVICE inline unsigned int atomic_wrapinc(unsigned int* address, unsigned int value) { return ::atomicInc(address, value); } - ROCPRIM_DEVICE inline - unsigned int atomic_exch(unsigned int * address, unsigned int value) + ROCPRIM_DEVICE inline unsigned int atomic_exch(unsigned int* address, unsigned int value) { return ::atomicExch(address, value); } - ROCPRIM_DEVICE inline - unsigned long long atomic_exch(unsigned long long * address, unsigned long long value) + ROCPRIM_DEVICE inline unsigned long long atomic_exch(unsigned long long* address, + unsigned long long value) { return ::atomicExch(address, value); } diff --git a/rocprim/include/rocprim/intrinsics/bit.hpp b/rocprim/include/rocprim/intrinsics/bit.hpp index a7e266979..c1d591c26 100644 --- a/rocprim/include/rocprim/intrinsics/bit.hpp +++ b/rocprim/include/rocprim/intrinsics/bit.hpp @@ -29,8 +29,7 @@ BEGIN_ROCPRIM_NAMESPACE /// @{ /// \brief Returns a single bit at 'i' from 'x' -ROCPRIM_DEVICE inline -int get_bit(int x, int i) +ROCPRIM_DEVICE inline int get_bit(int x, int i) { return (x >> i) & 1; } @@ -38,8 +37,7 @@ int get_bit(int x, int i) /// \brief Bit count /// /// Returns the number of bit of \p x set. -ROCPRIM_DEVICE inline -unsigned int bit_count(unsigned int x) +ROCPRIM_DEVICE inline unsigned int bit_count(unsigned int x) { return __popc(x); } @@ -47,8 +45,7 @@ unsigned int bit_count(unsigned int x) /// \brief Bit count /// /// Returns the number of bit of \p x set. -ROCPRIM_DEVICE inline -unsigned int bit_count(unsigned long long x) +ROCPRIM_DEVICE inline unsigned int bit_count(unsigned long long x) { return __popcll(x); } diff --git a/rocprim/include/rocprim/intrinsics/thread.hpp b/rocprim/include/rocprim/intrinsics/thread.hpp index 3060f5159..98c3a2bd1 100644 --- a/rocprim/include/rocprim/intrinsics/thread.hpp +++ b/rocprim/include/rocprim/intrinsics/thread.hpp @@ -36,22 +36,19 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Returns a number of threads in a hardware warp. /// /// It is constant for a device. -ROCPRIM_HOST_DEVICE inline -constexpr unsigned int warp_size() +ROCPRIM_HOST_DEVICE inline constexpr unsigned int warp_size() { return warpSize; } /// \brief Returns flat size of a multidimensional block (tile). -ROCPRIM_DEVICE inline -unsigned int flat_block_size() +ROCPRIM_DEVICE inline unsigned int flat_block_size() { return hipBlockDim_z * hipBlockDim_y * hipBlockDim_x; } /// \brief Returns flat size of a multidimensional tile (block). -ROCPRIM_DEVICE inline -unsigned int flat_tile_size() +ROCPRIM_DEVICE inline unsigned int flat_tile_size() { return flat_block_size(); } @@ -59,49 +56,41 @@ unsigned int flat_tile_size() // IDs /// \brief Returns thread identifier in a warp. -ROCPRIM_DEVICE inline -unsigned int lane_id() +ROCPRIM_DEVICE inline unsigned int lane_id() { return ::__lane_id(); } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional block (tile). -ROCPRIM_DEVICE inline -unsigned int flat_block_thread_id() +ROCPRIM_DEVICE inline unsigned int flat_block_thread_id() { - return (hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) - + (hipThreadIdx_y * hipBlockDim_x) - + hipThreadIdx_x; + return (hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) + (hipThreadIdx_y * hipBlockDim_x) + + hipThreadIdx_x; } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional tile (block). -ROCPRIM_DEVICE inline -unsigned int flat_tile_thread_id() +ROCPRIM_DEVICE inline unsigned int flat_tile_thread_id() { return flat_block_thread_id(); } /// \brief Returns warp id in a block (tile). -ROCPRIM_DEVICE inline -unsigned int warp_id() +ROCPRIM_DEVICE inline unsigned int warp_id() { - return flat_block_thread_id()/warp_size(); + return flat_block_thread_id() / warp_size(); } /// \brief Returns flat (linear, 1D) block identifier in a multidimensional grid. -ROCPRIM_DEVICE inline -unsigned int flat_block_id() +ROCPRIM_DEVICE inline unsigned int flat_block_id() { - return (hipBlockIdx_z * hipGridDim_y * hipGridDim_x) - + (hipBlockIdx_y * hipGridDim_x) - + hipBlockIdx_x; + return (hipBlockIdx_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x) + + hipBlockIdx_x; } // Sync /// \brief Synchronize all threads in a block (tile) -ROCPRIM_DEVICE inline -void syncthreads() +ROCPRIM_DEVICE inline void syncthreads() { __syncthreads(); } @@ -109,9 +98,8 @@ void syncthreads() namespace detail { /// \brief Returns thread identifier in a multidimensional block (tile) by dimension. - template - ROCPRIM_DEVICE inline - unsigned int block_thread_id() + template + ROCPRIM_DEVICE inline unsigned int block_thread_id() { static_assert(Dim > 2, "Dim must be 0, 1 or 2"); // dummy return, correct values handled by specializations @@ -119,9 +107,8 @@ namespace detail } /// \brief Returns block identifier in a multidimensional grid by dimension. - template - ROCPRIM_DEVICE inline - unsigned int block_id() + template + ROCPRIM_DEVICE inline unsigned int block_id() { static_assert(Dim > 2, "Dim must be 0, 1 or 2"); // dummy return, correct values handled by specializations @@ -129,9 +116,8 @@ namespace detail } /// \brief Returns block size in a multidimensional grid by dimension. - template - ROCPRIM_DEVICE inline - unsigned int block_size() + template + ROCPRIM_DEVICE inline unsigned int block_size() { static_assert(Dim > 2, "Dim must be 0, 1 or 2"); // dummy return, correct values handled by specializations @@ -139,90 +125,80 @@ namespace detail } /// \brief Returns grid size by dimension. - template - ROCPRIM_DEVICE inline - unsigned int grid_size() + template + ROCPRIM_DEVICE inline unsigned int grid_size() { static_assert(Dim > 2, "Dim must be 0, 1 or 2"); // dummy return, correct values handled by specializations return 0; } - #define ROCPRIM_DETAIL_CONCAT(A, B) A ## B - #define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, dim, suffix) \ - template<> \ - ROCPRIM_DEVICE inline \ - unsigned int name() \ - { \ - return ROCPRIM_DETAIL_CONCAT(prefix, suffix); \ - } - #define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(name, prefix) \ - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 0, x) \ - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 1, y) \ - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 2, z) +#define ROCPRIM_DETAIL_CONCAT(A, B) A##B +#define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, dim, suffix) \ + template <> \ + ROCPRIM_DEVICE inline unsigned int name() \ + { \ + return ROCPRIM_DETAIL_CONCAT(prefix, suffix); \ + } +#define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(name, prefix) \ + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 0, x) \ + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 1, y) \ + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 2, z) ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_thread_id, hipThreadIdx_) ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_id, hipBlockIdx_) ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_size, hipBlockDim_) ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(grid_size, hipGridDim_) - #undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS - #undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC - #undef ROCPRIM_DETAIL_CONCAT +#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS +#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC +#undef ROCPRIM_DETAIL_CONCAT // Return thread id in a "logical warp", which can be smaller than a hardware warp size. - template - ROCPRIM_DEVICE inline - auto logical_lane_id() - -> typename std::enable_if::type + template + ROCPRIM_DEVICE inline auto logical_lane_id() -> + typename std::enable_if::type { - return lane_id() & (LogicalWarpSize-1); // same as land_id()%WarpSize + return lane_id() & (LogicalWarpSize - 1); // same as land_id()%WarpSize } - template - ROCPRIM_DEVICE inline - auto logical_lane_id() - -> typename std::enable_if::type + template + ROCPRIM_DEVICE inline auto logical_lane_id() -> + typename std::enable_if::type { - return lane_id()%LogicalWarpSize; + return lane_id() % LogicalWarpSize; } - template<> - ROCPRIM_DEVICE inline - unsigned int logical_lane_id() + template <> + ROCPRIM_DEVICE inline unsigned int logical_lane_id() { return lane_id(); } // Return id of "logical warp" in a block - template - ROCPRIM_DEVICE inline - unsigned int logical_warp_id() + template + ROCPRIM_DEVICE inline unsigned int logical_warp_id() { - return flat_block_thread_id()/LogicalWarpSize; + return flat_block_thread_id() / LogicalWarpSize; } - template<> - ROCPRIM_DEVICE inline - unsigned int logical_warp_id() + template <> + ROCPRIM_DEVICE inline unsigned int logical_warp_id() { return warp_id(); } - ROCPRIM_DEVICE inline - void memory_fence_system() + ROCPRIM_DEVICE inline void memory_fence_system() { ::__threadfence_system(); } - ROCPRIM_DEVICE inline - void memory_fence_block() + ROCPRIM_DEVICE inline void memory_fence_block() { ::__threadfence_block(); } - ROCPRIM_DEVICE inline - void memory_fence_device() + ROCPRIM_DEVICE inline void memory_fence_device() { ::__threadfence(); } diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 5e4f07d34..44b3b8691 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -33,8 +33,7 @@ BEGIN_ROCPRIM_NAMESPACE /// for the i-th thread of the warp and the i-th thread is active. /// /// \param predicate - input to be evaluated for all active lanes -ROCPRIM_DEVICE inline -unsigned long long ballot(int predicate) +ROCPRIM_DEVICE inline unsigned long long ballot(int predicate) { return ::__ballot(predicate); } @@ -43,8 +42,7 @@ unsigned long long ballot(int predicate) /// /// For each thread, this function returns the number of active threads which /// have i-th bit of \p x set and come before the current thread. -ROCPRIM_DEVICE inline -unsigned int masked_bit_count(unsigned long long x, unsigned int add = 0) +ROCPRIM_DEVICE inline unsigned int masked_bit_count(unsigned long long x, unsigned int add = 0) { int c; c = ::__mbcnt_lo(static_cast(x), add); @@ -55,17 +53,15 @@ unsigned int masked_bit_count(unsigned long long x, unsigned int add = 0) namespace detail { -ROCPRIM_DEVICE inline -int warp_any(int predicate) -{ - return ::__any(predicate); -} + ROCPRIM_DEVICE inline int warp_any(int predicate) + { + return ::__any(predicate); + } -ROCPRIM_DEVICE inline -int warp_all(int predicate) -{ - return ::__all(predicate); -} + ROCPRIM_DEVICE inline int warp_all(int predicate) + { + return ::__all(predicate); + } } // end detail namespace diff --git a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp index 055c43225..e003141c8 100644 --- a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp +++ b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp @@ -34,55 +34,54 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -ROCPRIM_DEVICE inline -T warp_shuffle_op(T input, ShuffleOp&& op) -{ - constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); - - int words[words_no]; - __builtin_memcpy(words, &input, sizeof(T)); - - #pragma unroll - for(int i = 0; i < words_no; i++) + template + ROCPRIM_DEVICE inline T warp_shuffle_op(T input, ShuffleOp&& op) { - words[i] = op(words[i]); - } + constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); - T output; - __builtin_memcpy(&output, words, sizeof(T)); + int words[words_no]; + __builtin_memcpy(words, &input, sizeof(T)); - return output; -} - -ROCPRIM_DEVICE -int __amdgcn_update_dpp(int old, int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) - __asm("llvm.amdgcn.update.dpp.i32"); +#pragma unroll + for(int i = 0; i < words_no; i++) + { + words[i] = op(words[i]); + } -template -ROCPRIM_DEVICE inline -T warp_move_dpp(T input, int dpp_ctrl, - int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = false) -{ - constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); + T output; + __builtin_memcpy(&output, words, sizeof(T)); - int words[words_no]; - __builtin_memcpy(words, &input, sizeof(T)); + return output; + } - #pragma unroll - for(int i = 0; i < words_no; i++) + ROCPRIM_DEVICE + int __amdgcn_update_dpp(int old, + int src, + int dpp_ctrl, + int row_mask, + int bank_mask, + bool bound_ctrl) __asm("llvm.amdgcn.update.dpp.i32"); + + template + ROCPRIM_DEVICE inline T warp_move_dpp( + T input, int dpp_ctrl, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = false) { - words[i] = __amdgcn_update_dpp( - 0, words[i], - dpp_ctrl, row_mask, bank_mask, bound_ctrl - ); - } + constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); - T output; - __builtin_memcpy(&output, words, sizeof(T)); + int words[words_no]; + __builtin_memcpy(words, &input, sizeof(T)); - return output; -} +#pragma unroll + for(int i = 0; i < words_no; i++) + { + words[i] = __amdgcn_update_dpp(0, words[i], dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } + + T output; + __builtin_memcpy(&output, words, sizeof(T)); + + return output; + } } // end namespace detail @@ -100,17 +99,10 @@ T warp_move_dpp(T input, int dpp_ctrl, /// \param input - input to pass to other threads /// \param src_lane - warp if of a thread whose \p input should be returned /// \param width - logical warp width -template -ROCPRIM_DEVICE inline -T warp_shuffle(T input, const int src_lane, const int width = warp_size()) +template +ROCPRIM_DEVICE inline T warp_shuffle(T input, const int src_lane, const int width = warp_size()) { - return detail::warp_shuffle_op( - input, - [=](int v) -> int - { - return __shfl(v, src_lane, width); - } - ); + return detail::warp_shuffle_op(input, [=](int v) -> int { return __shfl(v, src_lane, width); }); } /// \brief Shuffle up for any data type. @@ -125,17 +117,11 @@ T warp_shuffle(T input, const int src_lane, const int width = warp_size()) /// \param input - input to pass to other threads /// \param delta - offset for calculating source lane id /// \param width - logical warp width -template -ROCPRIM_DEVICE inline -T warp_shuffle_up(T input, const unsigned int delta, const int width = warp_size()) +template +ROCPRIM_DEVICE inline T + warp_shuffle_up(T input, const unsigned int delta, const int width = warp_size()) { - return detail::warp_shuffle_op( - input, - [=](int v) -> int - { - return __shfl_up(v, delta, width); - } - ); + return detail::warp_shuffle_op(input, [=](int v) -> int { return __shfl_up(v, delta, width); }); } /// \brief Shuffle down for any data type. @@ -150,17 +136,12 @@ T warp_shuffle_up(T input, const unsigned int delta, const int width = warp_size /// \param input - input to pass to other threads /// \param delta - offset for calculating source lane id /// \param width - logical warp width -template -ROCPRIM_DEVICE inline -T warp_shuffle_down(T input, const unsigned int delta, const int width = warp_size()) +template +ROCPRIM_DEVICE inline T + warp_shuffle_down(T input, const unsigned int delta, const int width = warp_size()) { - return detail::warp_shuffle_op( - input, - [=](int v) -> int - { - return __shfl_down(v, delta, width); - } - ); + return detail::warp_shuffle_op(input, + [=](int v) -> int { return __shfl_down(v, delta, width); }); } /// \brief Shuffle XOR for any data type. @@ -174,17 +155,12 @@ T warp_shuffle_down(T input, const unsigned int delta, const int width = warp_si /// \param input - input to pass to other threads /// \param lane_mask - mask used for calculating source lane id /// \param width - logical warp width -template -ROCPRIM_DEVICE inline -T warp_shuffle_xor(T input, const int lane_mask, const int width = warp_size()) +template +ROCPRIM_DEVICE inline T + warp_shuffle_xor(T input, const int lane_mask, const int width = warp_size()) { - return detail::warp_shuffle_op( - input, - [=](int v) -> int - { - return __shfl_xor(v, lane_mask, width); - } - ); + return detail::warp_shuffle_op(input, + [=](int v) -> int { return __shfl_xor(v, lane_mask, width); }); } END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/rocprim.hpp b/rocprim/include/rocprim/rocprim.hpp index 753c591cf..bcc9d80c3 100644 --- a/rocprim/include/rocprim/rocprim.hpp +++ b/rocprim/include/rocprim/rocprim.hpp @@ -30,11 +30,11 @@ #include "rocprim_version.hpp" -#include "intrinsics.hpp" #include "functional.hpp" -#include "types.hpp" -#include "type_traits.hpp" +#include "intrinsics.hpp" #include "iterator.hpp" +#include "type_traits.hpp" +#include "types.hpp" #include "warp/warp_reduce.hpp" #include "warp/warp_scan.hpp" @@ -55,11 +55,11 @@ #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" -#include "device/device_reduce_by_key.hpp" #include "device/device_reduce.hpp" +#include "device/device_reduce_by_key.hpp" #include "device/device_run_length_encode.hpp" -#include "device/device_scan_by_key.hpp" #include "device/device_scan.hpp" +#include "device/device_scan_by_key.hpp" #include "device/device_segmented_radix_sort.hpp" #include "device/device_segmented_reduce.hpp" #include "device/device_segmented_scan.hpp" @@ -70,8 +70,7 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Returns version of rocPRIM library. /// \return version of rocPRIM library -ROCPRIM_HOST_DEVICE inline -unsigned int version() +ROCPRIM_HOST_DEVICE inline unsigned int version() { return ROCPRIM_VERSION; } diff --git a/rocprim/include/rocprim/rocprim_version.hpp.in b/rocprim/include/rocprim/rocprim_version.hpp.in index 8b5682982..14c168d53 100644 --- a/rocprim/include/rocprim/rocprim_version.hpp.in +++ b/rocprim/include/rocprim/rocprim_version.hpp.in @@ -32,10 +32,11 @@ /// /// For example, if ROCPRIM_VERSION is 100500, then the major version is 1, /// the minor version is 5, and the patch level is 0. -#define ROCPRIM_VERSION @rocprim_VERSION_MAJOR@ * 100000 + @rocprim_VERSION_MINOR@ * 100 + @rocprim_VERSION_PATCH@ +#define ROCPRIM_VERSION \ + @rocprim_VERSION_MAJOR @ * 100000 + @rocprim_VERSION_MINOR @ * 100 + @rocprim_VERSION_PATCH @ -#define ROCPRIM_VERSION_MAJOR @rocprim_VERSION_MAJOR@ -#define ROCPRIM_VERSION_MINOR @rocprim_VERSION_MINOR@ -#define ROCPRIM_VERSION_PATCH @rocprim_VERSION_PATCH@ +#define ROCPRIM_VERSION_MAJOR @rocprim_VERSION_MAJOR @ +#define ROCPRIM_VERSION_MINOR @rocprim_VERSION_MINOR @ +#define ROCPRIM_VERSION_PATCH @rocprim_VERSION_PATCH @ #endif // ROCPRIM_VERSION_HPP_ \ No newline at end of file diff --git a/rocprim/include/rocprim/type_traits.hpp b/rocprim/include/rocprim/type_traits.hpp index 5cc0ff34d..1dab12291 100644 --- a/rocprim/include/rocprim/type_traits.hpp +++ b/rocprim/include/rocprim/type_traits.hpp @@ -34,70 +34,73 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Behaves like std::is_floating_point, but also includes half-precision /// floating point type (rocprim::half). -template +template struct is_floating_point : std::integral_constant< - bool, - std::is_floating_point::value || - std::is_same<::rocprim::half, typename std::remove_cv::type>::value - > {}; + bool, + std::is_floating_point::value + || std::is_same<::rocprim::half, typename std::remove_cv::type>::value> +{ +}; /// \brief Alias for std::is_integral. -template +template using is_integral = std::is_integral; /// \brief Behaves like std::is_arithmetic, but also includes half-precision /// floating point type (\ref rocprim::half). -template +template struct is_arithmetic : std::integral_constant< - bool, - std::is_arithmetic::value || - std::is_same<::rocprim::half, typename std::remove_cv::type>::value - > {}; + bool, + std::is_arithmetic::value + || std::is_same<::rocprim::half, typename std::remove_cv::type>::value> +{ +}; /// \brief Behaves like std::is_fundamental, but also includes half-precision /// floating point type (\ref rocprim::half). -template +template struct is_fundamental - : std::integral_constant< - bool, - std::is_fundamental::value || - std::is_same<::rocprim::half, typename std::remove_cv::type>::value -> {}; + : std::integral_constant< + bool, + std::is_fundamental::value + || std::is_same<::rocprim::half, typename std::remove_cv::type>::value> +{ +}; /// \brief Alias for std::is_unsigned. -template +template using is_unsigned = std::is_unsigned; /// \brief Behaves like std::is_signed, but also includes half-precision /// floating point type (\ref rocprim::half). -template +template struct is_signed : std::integral_constant< - bool, - std::is_signed::value || - std::is_same<::rocprim::half, typename std::remove_cv::type>::value - > {}; + bool, + std::is_signed::value + || std::is_same<::rocprim::half, typename std::remove_cv::type>::value> +{ +}; /// \brief Behaves like std::is_scalar, but also includes half-precision /// floating point type (\ref rocprim::half). -template +template struct is_scalar : std::integral_constant< - bool, - std::is_scalar::value || - std::is_same<::rocprim::half, typename std::remove_cv::type>::value - > {}; + bool, + std::is_scalar::value + || std::is_same<::rocprim::half, typename std::remove_cv::type>::value> +{ +}; /// \brief Behaves like std::is_compound, but also supports half-precision /// floating point type (\ref rocprim::half). `value` for \ref rocprim::half is `false`. -template -struct is_compound - : std::integral_constant< - bool, - !is_fundamental::value - > {}; +template +struct is_compound : std::integral_constant::value> +{ +}; END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/types.hpp b/rocprim/include/rocprim/types.hpp index 1cd702d42..6128b9ef9 100644 --- a/rocprim/include/rocprim/types.hpp +++ b/rocprim/include/rocprim/types.hpp @@ -42,59 +42,69 @@ namespace detail // Define vector types that will be used by rocPRIM internally. // We don't use HIP vector types because they don't generate correct // load/store operations, see https://github.com/RadeonOpenCompute/ROCm/issues/341 -#define DEFINE_VECTOR_TYPE(name, base) \ -\ -struct name##2 \ -{ \ - typedef base vector_value_type __attribute__((ext_vector_type(2))); \ - union { \ - vector_value_type data; \ - struct { base x, y; }; \ - }; \ -} __attribute__((aligned(sizeof(base) * 2))); \ -\ -struct name##4 \ -{ \ - typedef base vector_value_type __attribute__((ext_vector_type(4))); \ - union { \ - vector_value_type data; \ - struct { base x, y, w, z; }; \ - }; \ -} __attribute__((aligned(sizeof(base) * 4))); - -DEFINE_VECTOR_TYPE(char, char); -DEFINE_VECTOR_TYPE(short, short); -DEFINE_VECTOR_TYPE(int, int); -DEFINE_VECTOR_TYPE(longlong, long long); - -// Takes a scalar type T and matches to a vector type based on NumElements. -template -struct make_vector_type -{ - using type = void; -}; +#define DEFINE_VECTOR_TYPE(name, base) \ + \ + struct name##2 \ + { \ + typedef base vector_value_type __attribute__((ext_vector_type(2))); \ + union \ + { \ + vector_value_type data; \ + struct \ + { \ + base x, y; \ + }; \ + }; \ + } \ + __attribute__((aligned(sizeof(base) * 2))); \ + \ + struct name##4 \ + { \ + typedef base vector_value_type __attribute__((ext_vector_type(4))); \ + union \ + { \ + vector_value_type data; \ + struct \ + { \ + base x, y, w, z; \ + }; \ + }; \ + } \ + __attribute__((aligned(sizeof(base) * 4))); + + DEFINE_VECTOR_TYPE(char, char); + DEFINE_VECTOR_TYPE(short, short); + DEFINE_VECTOR_TYPE(int, int); + DEFINE_VECTOR_TYPE(longlong, long long); + + // Takes a scalar type T and matches to a vector type based on NumElements. + template + struct make_vector_type + { + using type = void; + }; #define DEFINE_MAKE_VECTOR_N_TYPE(name, base, suffix) \ -template<> \ -struct make_vector_type \ -{ \ - using type = name##suffix; \ -}; - -#define DEFINE_MAKE_VECTOR_TYPE(name, base) \ -\ -template <> \ -struct make_vector_type \ -{ \ - using type = base; \ -}; \ -DEFINE_MAKE_VECTOR_N_TYPE(name, base, 2) \ -DEFINE_MAKE_VECTOR_N_TYPE(name, base, 4) - -DEFINE_MAKE_VECTOR_TYPE(char, char); -DEFINE_MAKE_VECTOR_TYPE(short, short); -DEFINE_MAKE_VECTOR_TYPE(int, int); -DEFINE_MAKE_VECTOR_TYPE(longlong, long long); + template <> \ + struct make_vector_type \ + { \ + using type = name##suffix; \ + }; + +#define DEFINE_MAKE_VECTOR_TYPE(name, base) \ + \ + template <> \ + struct make_vector_type \ + { \ + using type = base; \ + }; \ + DEFINE_MAKE_VECTOR_N_TYPE(name, base, 2) \ + DEFINE_MAKE_VECTOR_N_TYPE(name, base, 4) + + DEFINE_MAKE_VECTOR_TYPE(char, char); + DEFINE_MAKE_VECTOR_TYPE(short, short); + DEFINE_MAKE_VECTOR_TYPE(int, int); + DEFINE_MAKE_VECTOR_TYPE(longlong, long long); #undef DEFINE_VECTOR_TYPE #undef DEFINE_MAKE_VECTOR_TYPE @@ -106,7 +116,6 @@ DEFINE_MAKE_VECTOR_TYPE(longlong, long long); /// template parameter should not be used. struct empty_type { - }; /// \brief Half-precision floating point type diff --git a/rocprim/include/rocprim/types/double_buffer.hpp b/rocprim/include/rocprim/types/double_buffer.hpp index 8f9322548..743dfdea7 100644 --- a/rocprim/include/rocprim/types/double_buffer.hpp +++ b/rocprim/include/rocprim/types/double_buffer.hpp @@ -28,45 +28,39 @@ BEGIN_ROCPRIM_NAMESPACE -template +template class double_buffer { - T * buffers[2]; + T* buffers[2]; unsigned int selector; public: - - ROCPRIM_HOST_DEVICE inline - double_buffer() + ROCPRIM_HOST_DEVICE inline double_buffer() { - selector = 0; + selector = 0; buffers[0] = nullptr; buffers[1] = nullptr; } - ROCPRIM_HOST_DEVICE inline - double_buffer(T * current, T * alternate) + ROCPRIM_HOST_DEVICE inline double_buffer(T* current, T* alternate) { - selector = 0; + selector = 0; buffers[0] = current; buffers[1] = alternate; } - ROCPRIM_HOST_DEVICE inline - T * current() const + ROCPRIM_HOST_DEVICE inline T* current() const { return buffers[selector]; } - ROCPRIM_HOST_DEVICE inline - T * alternate() const + ROCPRIM_HOST_DEVICE inline T* alternate() const { return buffers[selector ^ 1]; } - ROCPRIM_HOST_DEVICE inline - void swap() + ROCPRIM_HOST_DEVICE inline void swap() { selector ^= 1; } diff --git a/rocprim/include/rocprim/types/integer_sequence.hpp b/rocprim/include/rocprim/types/integer_sequence.hpp index be5ea7bb3..d743a756b 100644 --- a/rocprim/include/rocprim/types/integer_sequence.hpp +++ b/rocprim/include/rocprim/types/integer_sequence.hpp @@ -28,16 +28,16 @@ BEGIN_ROCPRIM_NAMESPACE #if defined(__cpp_lib_integer_sequence) && !defined(DOXYGEN_SHOULD_SKIP_THIS) // For C++14 or newer we just use standard implementation +using std::index_sequence_for; using std::integer_sequence; -using std::make_integer_sequence; using std::make_index_sequence; -using std::index_sequence_for; +using std::make_integer_sequence; #else /// \brief Compile-time sequence of integers /// /// Implements std::integer_sequence for C++11. When C++14 is supported /// it is just an alias for std::integer_sequence. -template +template class integer_sequence { using value_type = T; @@ -48,43 +48,43 @@ class integer_sequence } }; -template +template using index_sequence = integer_sequence; // DETAILS namespace detail { -template -struct integer_sequence_cat; + template + struct integer_sequence_cat; -template -struct integer_sequence_cat> -{ - using type = typename ::rocprim::integer_sequence; -}; + template + struct integer_sequence_cat> + { + using type = typename ::rocprim::integer_sequence; + }; -template -struct make_integer_sequence_impl : - integer_sequence_cat::type> -{ -}; + template + struct make_integer_sequence_impl + : integer_sequence_cat::type> + { + }; -template -struct make_integer_sequence_impl -{ - using type = ::rocprim::integer_sequence; -}; + template + struct make_integer_sequence_impl + { + using type = ::rocprim::integer_sequence; + }; } // end detail namespace -template +template using make_integer_sequence = typename detail::make_integer_sequence_impl::type; -template +template using make_index_sequence = make_integer_sequence; -template +template using index_sequence_for = make_index_sequence; #endif diff --git a/rocprim/include/rocprim/types/key_value_pair.hpp b/rocprim/include/rocprim/types/key_value_pair.hpp index 7ddaba238..012505728 100644 --- a/rocprim/include/rocprim/types/key_value_pair.hpp +++ b/rocprim/include/rocprim/types/key_value_pair.hpp @@ -28,46 +28,40 @@ BEGIN_ROCPRIM_NAMESPACE -template< - class Key_, - class Value_ -> +template struct key_value_pair { - #ifndef DOXYGEN_SHOULD_SKIP_THIS - using Key = Key_; +#ifndef DOXYGEN_SHOULD_SKIP_THIS + using Key = Key_; using Value = Value_; - #endif +#endif - using key_type = Key_; + using key_type = Key_; using value_type = Value_; - key_type key; + key_type key; value_type value; - ROCPRIM_HOST_DEVICE inline - key_value_pair() = default; + ROCPRIM_HOST_DEVICE inline key_value_pair() = default; - ROCPRIM_HOST_DEVICE inline - ~key_value_pair() = default; + ROCPRIM_HOST_DEVICE inline ~key_value_pair() = default; - ROCPRIM_HOST_DEVICE inline - key_value_pair(const key_type key, const value_type value) : key(key), value(value) + ROCPRIM_HOST_DEVICE inline key_value_pair(const key_type key, const value_type value) + : key(key) + , value(value) { } - #if __hcc_major__ < 1 || __hcc_major__ == 1 && __hcc_minor__ < 2 - ROCPRIM_HOST_DEVICE inline - key_value_pair& operator =(const key_value_pair& kvb) +#if __hcc_major__ < 1 || __hcc_major__ == 1 && __hcc_minor__ < 2 + ROCPRIM_HOST_DEVICE inline key_value_pair& operator=(const key_value_pair& kvb) { - key = kvb.key; + key = kvb.key; value = kvb.value; return *this; } - #endif +#endif - ROCPRIM_HOST_DEVICE inline - bool operator !=(const key_value_pair& kvb) + ROCPRIM_HOST_DEVICE inline bool operator!=(const key_value_pair& kvb) { return (key != kvb.key) || (value != kvb.value); } diff --git a/rocprim/include/rocprim/types/tuple.hpp b/rocprim/include/rocprim/types/tuple.hpp index dccbcc53b..d44dac103 100644 --- a/rocprim/include/rocprim/types/tuple.hpp +++ b/rocprim/include/rocprim/types/tuple.hpp @@ -37,7 +37,7 @@ BEGIN_ROCPRIM_NAMESPACE // //////////////////////// // tuple (FORWARD DECLARATION) // //////////////////////// -template +template class tuple; // //////////////////////// @@ -47,39 +47,34 @@ class tuple; /// \brief Provides access to the number of elements in a tuple as a compile-time constant expression. /// /// tuple_size is undefined for types \p T that are not tuples. -template +template class tuple_size; /// \brief For \p T that is tuple, \p tuple_size::value is the /// the number of elements in a tuple (equal to sizeof...(Types)). /// /// \see std::integral_constant -template -class tuple_size<::rocprim::tuple> : public std::integral_constant +template +class tuple_size<::rocprim::tuple> + : public std::integral_constant { // All member functions of std::integral_constant are constexpr, so it should work // without problems on HIP }; /// const T specialization of \ref tuple_size -template -class tuple_size - : public std::integral_constant::value> +template +class tuple_size : public std::integral_constant::value> { - }; /// volatile T specialization of \ref tuple_size -template -class tuple_size - : public std::integral_constant::value> +template +class tuple_size : public std::integral_constant::value> { - }; /// const volatile T specialization of \ref tuple_size -template -class tuple_size - : public std::integral_constant::value> +template +class tuple_size : public std::integral_constant::value> { - }; // //////////////////////// @@ -89,64 +84,63 @@ class tuple_size /// \brief Provides compile-time indexed access to the types of the elements of the tuple. /// /// tuple_element is undefined for types \p T that are not tuples. -template +template struct tuple_element; // rocprim::tuple_size is defined only for rocprim::tuple namespace detail { -template -struct tuple_element_impl; - -template -struct tuple_element_impl> - : tuple_element_impl> -{ + template + struct tuple_element_impl; -}; + template + struct tuple_element_impl> + : tuple_element_impl> + { + }; -template -struct tuple_element_impl<0, ::rocprim::tuple> -{ - using type = T; -}; + template + struct tuple_element_impl<0, ::rocprim::tuple> + { + using type = T; + }; -template -struct tuple_element_impl> -{ - static_assert(I != I, "tuple_element index out of range"); -}; + template + struct tuple_element_impl> + { + static_assert(I != I, "tuple_element index out of range"); + }; } // end detail namespace /// \brief For \p T that is tuple, \p tuple_element::type is the /// type of Ith element of that tuple. -template +template struct tuple_element> { - /// \brief The type of Ith element of the tuple, where \p I is in [0, sizeof...(Types)) - #ifndef DOXYGEN_SHOULD_SKIP_THIS +/// \brief The type of Ith element of the tuple, where \p I is in [0, sizeof...(Types)) +#ifndef DOXYGEN_SHOULD_SKIP_THIS using type = typename detail::tuple_element_impl>::type; - #else +#else typedef type; - #endif +#endif }; /// const T specialization of \ref tuple_element -template +template struct tuple_element { /// \brief The type of Ith element of the tuple, where \p I is in [0, sizeof...(Types)) using type = typename std::add_const::type>::type; }; /// volatile T specialization of \ref tuple_element -template +template struct tuple_element { /// \brief The type of Ith element of the tuple, where \p I is in [0, sizeof...(Types)) using type = typename std::add_volatile::type>::type; }; /// const volatile T specialization of \ref tuple_element -template +template struct tuple_element { /// \brief The type of Ith element of the tuple, where \p I is in [0, sizeof...(Types)) @@ -158,17 +152,15 @@ using tuple_element_t = typename tuple_element::type; // get forward declaration #ifndef DOXYGEN_SHOULD_SKIP_THIS -template -ROCPRIM_HOST_DEVICE -const tuple_element_t>& get(const tuple&) noexcept; +template +ROCPRIM_HOST_DEVICE const tuple_element_t>& + get(const tuple&) noexcept; -template -ROCPRIM_HOST_DEVICE -tuple_element_t>& get(tuple&) noexcept; +template +ROCPRIM_HOST_DEVICE tuple_element_t>& get(tuple&) noexcept; -template -ROCPRIM_HOST_DEVICE -tuple_element_t>&& get(tuple&&) noexcept; +template +ROCPRIM_HOST_DEVICE tuple_element_t>&& get(tuple&&) noexcept; #endif // //////////////////////// @@ -179,344 +171,268 @@ namespace detail { #ifdef __cpp_lib_is_final - template + template using is_final = std::is_final; #elif defined(__HCC__) // use clang extention - template + template using is_final = std::integral_constant; #else - template + template struct is_final : std::false_type { }; #endif -// tuple_value - represents single element in a tuple -template< - size_t I, - class T, - bool /* Empty base optimization switch */ = std::is_empty::value && !is_final::value -> -struct tuple_value -{ - T value; - - ROCPRIM_HOST_DEVICE inline - constexpr tuple_value() noexcept : value() + // tuple_value - represents single element in a tuple + template ::value + && !is_final::value> + struct tuple_value { - static_assert(!std::is_reference::value, "can't default construct a reference element in a tuple" ); - } + T value; - ROCPRIM_HOST_DEVICE inline - tuple_value(const tuple_value&) = default; + ROCPRIM_HOST_DEVICE inline constexpr tuple_value() noexcept + : value() + { + static_assert(!std::is_reference::value, + "can't default construct a reference element in a tuple"); + } - ROCPRIM_HOST_DEVICE inline - tuple_value(tuple_value&&) = default; + ROCPRIM_HOST_DEVICE inline tuple_value(const tuple_value&) = default; - ROCPRIM_HOST_DEVICE inline - explicit tuple_value(T value) noexcept - : value(value) - { - // This is workaround for hcc which fails during linking without - // this constructor with undefine reference errors when U from ctors - // below is exactly T. Example: - // rocprim::tuple t(1, 2, 3); - // Produced error: - // undefined reference to `rocprim::detail::tuple_value<0ul, int>::tuple_value(int) - } + ROCPRIM_HOST_DEVICE inline tuple_value(tuple_value&&) = default; - template< - class U, - typename = typename std::enable_if< - !std::is_same::type, tuple_value>::value - >::type, - typename = typename std::enable_if< - std::is_constructible::value - >::type - > - ROCPRIM_HOST_DEVICE inline - explicit tuple_value(const U& v) noexcept : value(v) - { - } + ROCPRIM_HOST_DEVICE inline explicit tuple_value(T value) noexcept + : value(value) + { + // This is workaround for hcc which fails during linking without + // this constructor with undefine reference errors when U from ctors + // below is exactly T. Example: + // rocprim::tuple t(1, 2, 3); + // Produced error: + // undefined reference to `rocprim::detail::tuple_value<0ul, int>::tuple_value(int) + } - template< - class U, - typename = typename std::enable_if< - // So U can't be tuple_value - !std::is_same::type, tuple_value>::value - >::type, - typename = typename std::enable_if< - std::is_constructible::value - >::type - > - ROCPRIM_HOST_DEVICE inline - explicit tuple_value(U&& v) noexcept : value(std::forward(v)) - { - } + template ::type, tuple_value>::value>::type, + typename + = typename std::enable_if::value>::type> + ROCPRIM_HOST_DEVICE inline explicit tuple_value(const U& v) noexcept + : value(v) + { + } - ROCPRIM_HOST_DEVICE inline - ~tuple_value() = default; + template + !std::is_same::type, tuple_value>::value>::type, + typename = typename std::enable_if::value>::type> + ROCPRIM_HOST_DEVICE inline explicit tuple_value(U&& v) noexcept + : value(std::forward(v)) + { + } - template - ROCPRIM_HOST_DEVICE inline - tuple_value& operator=(U&& v) noexcept - { - value = std::forward(v); - return *this; - } + ROCPRIM_HOST_DEVICE inline ~tuple_value() = default; - ROCPRIM_HOST_DEVICE inline - void swap(tuple_value& v) noexcept - { - auto tmp = std::move(v.value); - v.value = std::move(this->value); - this->value = std::move(tmp); - } + template + ROCPRIM_HOST_DEVICE inline tuple_value& operator=(U&& v) noexcept + { + value = std::forward(v); + return *this; + } - ROCPRIM_HOST_DEVICE inline - T& get() noexcept - { - return value; - } + ROCPRIM_HOST_DEVICE inline void swap(tuple_value& v) noexcept + { + auto tmp = std::move(v.value); + v.value = std::move(this->value); + this->value = std::move(tmp); + } - ROCPRIM_HOST_DEVICE inline - const T& get() const noexcept - { - return value; - } -}; + ROCPRIM_HOST_DEVICE inline T& get() noexcept + { + return value; + } -// Specialization for empty base optimization -template -struct tuple_value : private T -{ - ROCPRIM_HOST_DEVICE inline - constexpr tuple_value() noexcept : T() + ROCPRIM_HOST_DEVICE inline const T& get() const noexcept + { + return value; + } + }; + + // Specialization for empty base optimization + template + struct tuple_value : private T { - static_assert(!std::is_reference::value, "can't default construct a reference element in a tuple" ); - } + ROCPRIM_HOST_DEVICE inline constexpr tuple_value() noexcept + : T() + { + static_assert(!std::is_reference::value, + "can't default construct a reference element in a tuple"); + } - ROCPRIM_HOST_DEVICE inline - tuple_value(const tuple_value&) = default; + ROCPRIM_HOST_DEVICE inline tuple_value(const tuple_value&) = default; - ROCPRIM_HOST_DEVICE inline - tuple_value(tuple_value&&) = default; + ROCPRIM_HOST_DEVICE inline tuple_value(tuple_value&&) = default; - ROCPRIM_HOST_DEVICE inline - explicit tuple_value(T value) noexcept - : T(value) - { - // This is workaround for hcc which fails during linking without - // this constructor with undefine reference errors when U from ctors - // below is exactly T. Example: - // rocprim::tuple t(1, 2, 3); - // Produced error: - // undefined reference to `rocprim::detail::tuple_value<0ul, int>::tuple_value(int) - } + ROCPRIM_HOST_DEVICE inline explicit tuple_value(T value) noexcept + : T(value) + { + // This is workaround for hcc which fails during linking without + // this constructor with undefine reference errors when U from ctors + // below is exactly T. Example: + // rocprim::tuple t(1, 2, 3); + // Produced error: + // undefined reference to `rocprim::detail::tuple_value<0ul, int>::tuple_value(int) + } - template< - class U, - typename = typename std::enable_if< - !std::is_same::type, tuple_value>::value - >::type, - typename = typename std::enable_if< - std::is_constructible::value - >::type - > - ROCPRIM_HOST_DEVICE inline - explicit tuple_value(const U& v) noexcept : T(v) - { - } + template ::type, tuple_value>::value>::type, + typename + = typename std::enable_if::value>::type> + ROCPRIM_HOST_DEVICE inline explicit tuple_value(const U& v) noexcept + : T(v) + { + } - template< - class U, - typename = typename std::enable_if< - // So U can't be tuple_value - !std::is_same::type, tuple_value>::value - >::type, - typename = typename std::enable_if< - std::is_constructible::value - >::type - > - ROCPRIM_HOST_DEVICE inline - explicit tuple_value(U&& v) noexcept : T(std::forward(v)) - { - } + template + !std::is_same::type, tuple_value>::value>::type, + typename = typename std::enable_if::value>::type> + ROCPRIM_HOST_DEVICE inline explicit tuple_value(U&& v) noexcept + : T(std::forward(v)) + { + } - ROCPRIM_HOST_DEVICE inline - ~tuple_value() = default; + ROCPRIM_HOST_DEVICE inline ~tuple_value() = default; - template - ROCPRIM_HOST_DEVICE inline - tuple_value& operator=(U&& v) noexcept - { - T::operator=(std::forward(v)); - return *this; - } + template + ROCPRIM_HOST_DEVICE inline tuple_value& operator=(U&& v) noexcept + { + T::operator=(std::forward(v)); + return *this; + } - ROCPRIM_HOST_DEVICE inline - void swap(tuple_value& v) noexcept - { - auto tmp = std::move(v); - v = std::move(*this); - *this = std::move(tmp); - } + ROCPRIM_HOST_DEVICE inline void swap(tuple_value& v) noexcept + { + auto tmp = std::move(v); + v = std::move(*this); + *this = std::move(tmp); + } - ROCPRIM_HOST_DEVICE inline - T& get() noexcept - { - return static_cast(*this); - } + ROCPRIM_HOST_DEVICE inline T& get() noexcept + { + return static_cast(*this); + } + + ROCPRIM_HOST_DEVICE inline const T& get() const noexcept + { + return static_cast(*this); + } + }; - ROCPRIM_HOST_DEVICE inline - const T& get() const noexcept + template + ROCPRIM_HOST_DEVICE inline void swallow(Types&&...) noexcept { - return static_cast(*this); } -}; - -template -ROCPRIM_HOST_DEVICE inline -void swallow(Types&&...) noexcept {} -template -struct tuple_impl; + template + struct tuple_impl; -template -struct tuple_impl<::rocprim::index_sequence, Types...> - : tuple_value... -{ - ROCPRIM_HOST_DEVICE inline - constexpr tuple_impl() = default; + template + struct tuple_impl<::rocprim::index_sequence, Types...> + : tuple_value... + { + ROCPRIM_HOST_DEVICE inline constexpr tuple_impl() = default; - ROCPRIM_HOST_DEVICE inline - tuple_impl(const tuple_impl&) = default; + ROCPRIM_HOST_DEVICE inline tuple_impl(const tuple_impl&) = default; - ROCPRIM_HOST_DEVICE inline - tuple_impl(tuple_impl&&) = default; + ROCPRIM_HOST_DEVICE inline tuple_impl(tuple_impl&&) = default; - ROCPRIM_HOST_DEVICE inline - explicit tuple_impl(Types... values) - : tuple_value(values)... - { - // This is workaround for hcc which fails during linking without - // this constructor with undefine reference errors when UTypes - // are exactly Types (see constructor below). Example: - // rocprim::tuple t(1, 2, 3); - // Produced error: - // undefined reference to `rocprim::detail::tuple_impl< - // rocprim::integer_sequence, int, int, int - // >::tuple_impl(int, int, int)' - } + ROCPRIM_HOST_DEVICE inline explicit tuple_impl(Types... values) + : tuple_value(values)... + { + // This is workaround for hcc which fails during linking without + // this constructor with undefine reference errors when UTypes + // are exactly Types (see constructor below). Example: + // rocprim::tuple t(1, 2, 3); + // Produced error: + // undefined reference to `rocprim::detail::tuple_impl< + // rocprim::integer_sequence, int, int, int + // >::tuple_impl(int, int, int)' + } - template< - class... UTypes, - typename = typename std::enable_if< - sizeof...(UTypes) == sizeof...(Types) - >::type, - typename = typename std::enable_if< - sizeof...(Types) >= 1 - >::type - > - ROCPRIM_HOST_DEVICE inline - explicit tuple_impl(UTypes&&... values) - : tuple_value(std::forward(values))... - { - } + template ::type, + typename = typename std::enable_if= 1>::type> + ROCPRIM_HOST_DEVICE inline explicit tuple_impl(UTypes&&... values) + : tuple_value(std::forward(values))... + { + } - template< - class... UTypes, - typename = typename std::enable_if< - sizeof...(UTypes) == sizeof...(Types) - >::type, - typename = typename std::enable_if< - sizeof...(Types) >= 1 - >::type - > - ROCPRIM_HOST_DEVICE inline - tuple_impl(::rocprim::tuple&& other) - : tuple_value(std::forward(::rocprim::get(other)))... - { - } + template ::type, + typename = typename std::enable_if= 1>::type> + ROCPRIM_HOST_DEVICE inline tuple_impl(::rocprim::tuple&& other) + : tuple_value(std::forward(::rocprim::get(other)))... + { + } - template< - class... UTypes, - typename = typename std::enable_if< - sizeof...(UTypes) == sizeof...(Types) - >::type, - typename = typename std::enable_if< - sizeof...(Types) >= 1 - >::type - > - ROCPRIM_HOST_DEVICE inline - tuple_impl(const ::rocprim::tuple& other) - : tuple_value(::rocprim::get(other))... - { - } + template ::type, + typename = typename std::enable_if= 1>::type> + ROCPRIM_HOST_DEVICE inline tuple_impl(const ::rocprim::tuple& other) + : tuple_value(::rocprim::get(other))... + { + } - ROCPRIM_HOST_DEVICE inline - ~tuple_impl() = default; + ROCPRIM_HOST_DEVICE inline ~tuple_impl() = default; - ROCPRIM_HOST_DEVICE inline - tuple_impl& operator=(const tuple_impl& other) noexcept - { - swallow( - tuple_value::operator=( - static_cast&>(other).get() - )... - ); - return *this; - } + ROCPRIM_HOST_DEVICE inline tuple_impl& operator=(const tuple_impl& other) noexcept + { + swallow(tuple_value::operator=( + static_cast&>(other).get())...); + return *this; + } - ROCPRIM_HOST_DEVICE inline - tuple_impl& operator=(tuple_impl&& other) noexcept - { - swallow( - tuple_value::operator=( - static_cast&>(other).get() - )... - ); - return *this; - } + ROCPRIM_HOST_DEVICE inline tuple_impl& operator=(tuple_impl&& other) noexcept + { + swallow(tuple_value::operator=( + static_cast&>(other).get())...); + return *this; + } - template - ROCPRIM_HOST_DEVICE inline - tuple_impl& operator=(const ::rocprim::tuple& other) noexcept - { - swallow(tuple_value::operator=(::rocprim::get(other))...); - return *this; - } + template + ROCPRIM_HOST_DEVICE inline tuple_impl& + operator=(const ::rocprim::tuple& other) noexcept + { + swallow(tuple_value::operator=(::rocprim::get(other))...); + return *this; + } - template - ROCPRIM_HOST_DEVICE inline - tuple_impl& operator=(::rocprim::tuple&& other) noexcept - { - swallow( - tuple_value::operator=( - ::rocprim::get(std::move(other)) - )... - ); - return *this; - } + template + ROCPRIM_HOST_DEVICE inline tuple_impl& + operator=(::rocprim::tuple&& other) noexcept + { + swallow(tuple_value::operator=( + ::rocprim::get(std::move(other)))...); + return *this; + } - ROCPRIM_HOST_DEVICE inline - tuple_impl& swap(tuple_impl& other) noexcept - { - swallow( - (static_cast&>(*this).swap( - static_cast&>(other) - ), 0)... - ); - return *this; - } -}; + ROCPRIM_HOST_DEVICE inline tuple_impl& swap(tuple_impl& other) noexcept + { + swallow((static_cast&>(*this).swap( + static_cast&>(other)), + 0)...); + return *this; + } + }; -template -using tuple_base = - tuple_impl< - typename ::rocprim::index_sequence_for, - Types... - >; + template + using tuple_base = tuple_impl, Types...>; } // end detail namespace @@ -529,70 +445,64 @@ using tuple_base = /// construction, copy and move assignment, and swapping. /// /// \see std::tuple -template +template class tuple { using base_type = detail::tuple_base; // tuple_impl base_type base; - template + template struct check_constructor { - template + template static constexpr bool enable_default() { return detail::all_true::value...>::value; } - template + template static constexpr bool enable_copy() { return detail::all_true::value...>::value; } }; - #ifndef DOXYGEN_SHOULD_SKIP_THIS - template - ROCPRIM_HOST_DEVICE - friend const tuple_element_t>& get(const tuple&) noexcept; +#ifndef DOXYGEN_SHOULD_SKIP_THIS + template + ROCPRIM_HOST_DEVICE friend const tuple_element_t>& + get(const tuple&) noexcept; - template - ROCPRIM_HOST_DEVICE - friend tuple_element_t>& get(tuple&) noexcept; + template + ROCPRIM_HOST_DEVICE friend tuple_element_t>& + get(tuple&) noexcept; - template - ROCPRIM_HOST_DEVICE - friend tuple_element_t>&& get(tuple&&) noexcept; - #endif + template + ROCPRIM_HOST_DEVICE friend tuple_element_t>&& + get(tuple&&) noexcept; +#endif public: - /// \brief Default constructor. Performs value-initialization of all elements. - /// - /// This overload only participates in overload resolution if: - /// * std::is_default_constructible::value is \p true for all \p i. - #ifndef DOXYGEN_SHOULD_SKIP_THIS - template< - class Dummy = void, - typename = typename std::enable_if< - check_constructor::template enable_default() - >::type - > - #endif - ROCPRIM_HOST_DEVICE inline - constexpr tuple() noexcept : base() {}; +/// \brief Default constructor. Performs value-initialization of all elements. +/// +/// This overload only participates in overload resolution if: +/// * std::is_default_constructible::value is \p true for all \p i. +#ifndef DOXYGEN_SHOULD_SKIP_THIS + template ::template enable_default()>::type> +#endif + ROCPRIM_HOST_DEVICE inline constexpr tuple() noexcept + : base() {}; /// \brief Implicitly-defined copy constructor. - ROCPRIM_HOST_DEVICE inline - tuple(const tuple&) = default; + ROCPRIM_HOST_DEVICE inline tuple(const tuple&) = default; /// \brief Implicitly-defined move constructor. - ROCPRIM_HOST_DEVICE inline - tuple(tuple&&) = default; + ROCPRIM_HOST_DEVICE inline tuple(tuple&&) = default; - #ifndef DOXYGEN_SHOULD_SKIP_THIS - ROCPRIM_HOST_DEVICE inline - explicit tuple(Types... values) noexcept +#ifndef DOXYGEN_SHOULD_SKIP_THIS + ROCPRIM_HOST_DEVICE inline explicit tuple(Types... values) noexcept : base(values...) { // Workaround for HCC compiler, without this we get undefined reference @@ -601,23 +511,19 @@ class tuple // Produces error: // 'undefined reference to `rocprim::tuple::tuple(int, double)' } - #endif +#endif - /// \brief Direct constructor. Initializes each element of the tuple with - /// the corresponding input value. - /// - /// This overload only participates in overload resolution if: - /// * std::is_copy_constructible::value is \p true for all \p i. - #ifndef DOXYGEN_SHOULD_SKIP_THIS - template< - class Dummy = void, - typename = typename std::enable_if< - check_constructor::template enable_copy() - >::type - > - #endif - ROCPRIM_HOST_DEVICE inline - explicit tuple(const Types&... values) +/// \brief Direct constructor. Initializes each element of the tuple with +/// the corresponding input value. +/// +/// This overload only participates in overload resolution if: +/// * std::is_copy_constructible::value is \p true for all \p i. +#ifndef DOXYGEN_SHOULD_SKIP_THIS + template ::template enable_copy()>::type> +#endif + ROCPRIM_HOST_DEVICE inline explicit tuple(const Types&... values) : base(values...) { } @@ -629,22 +535,16 @@ class tuple /// * sizeof...(Types) == sizeof...(UTypes), /// * sizeof...(Types) >= 1, and /// * std::is_constructible::value is \p true for all \p i. - template< - class... UTypes - #ifndef DOXYGEN_SHOULD_SKIP_THIS - ,typename = typename std::enable_if< - sizeof...(UTypes) == sizeof...(Types) - >::type, - typename = typename std::enable_if< - sizeof...(Types) >= 1 - >::type, - typename = typename std::enable_if< - detail::all_true::value...>::value - >::type - #endif - > - ROCPRIM_HOST_DEVICE inline - explicit tuple(UTypes&&... values) noexcept + template ::type, + typename = typename std::enable_if= 1>::type, + typename = typename std::enable_if< + detail::all_true::value...>::value>::type +#endif + > + ROCPRIM_HOST_DEVICE inline explicit tuple(UTypes&&... values) noexcept : base(std::forward(values)...) { } @@ -656,22 +556,15 @@ class tuple /// * sizeof...(Types) == sizeof...(UTypes), /// * sizeof...(Types) >= 1, and /// * std::is_constructible::value is \p true for all \p i. - template< - class... UTypes, - #ifndef DOXYGEN_SHOULD_SKIP_THIS - typename = typename std::enable_if< - sizeof...(UTypes) == sizeof...(Types) - >::type, - typename = typename std::enable_if< - sizeof...(Types) >= 1 - >::type, - typename = typename std::enable_if< - detail::all_true::value...>::value - >::type - #endif - > - ROCPRIM_HOST_DEVICE inline - tuple(const tuple& other) noexcept + template ::type, + typename = typename std::enable_if= 1>::type, + typename = typename std::enable_if::value...>::value>::type +#endif + > + ROCPRIM_HOST_DEVICE inline tuple(const tuple& other) noexcept : base(other) { } @@ -683,39 +576,26 @@ class tuple /// * sizeof...(Types) == sizeof...(UTypes), /// * sizeof...(Types) >= 1, and /// * std::is_constructible::value is \p true for all \p i. - template< - class... UTypes, - #ifndef DOXYGEN_SHOULD_SKIP_THIS - typename = typename std::enable_if< - sizeof...(UTypes) == sizeof...(Types) - >::type, - typename = typename std::enable_if< - sizeof...(Types) >= 1 - >::type, - typename = typename std::enable_if< - detail::all_true::value...>::value - >::type - #endif - > - ROCPRIM_HOST_DEVICE inline - tuple(tuple&& other) noexcept + template ::type, + typename = typename std::enable_if= 1>::type, + typename = typename std::enable_if< + detail::all_true::value...>::value>::type +#endif + > + ROCPRIM_HOST_DEVICE inline tuple(tuple&& other) noexcept : base(std::forward>(other)) { } /// \brief Implicitly-defined destructor. - ROCPRIM_HOST_DEVICE inline - ~tuple() noexcept = default; - - #ifndef DOXYGEN_SHOULD_SKIP_THIS - template< - class T, - typename = typename std::enable_if< - std::is_assignable::value - >::type - > - ROCPRIM_HOST_DEVICE inline - tuple& operator=(T&& v) noexcept + ROCPRIM_HOST_DEVICE inline ~tuple() noexcept = default; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + template ::value>::type> + ROCPRIM_HOST_DEVICE inline tuple& operator=(T&& v) noexcept { base = std::forward(v); return *this; @@ -726,7 +606,7 @@ class tuple base = other.base; return *this; } - #else // For documentation +#else // For documentation /// \brief Copy assignment operator. /// \param other tuple to replace the contents of this tuple tuple& operator=(const tuple& other) noexcept; @@ -735,13 +615,13 @@ class tuple tuple& operator=(tuple&& other) noexcept; /// \brief For all \p i, assigns \p rocprim::get(other) to \p rocprim::get(*this). /// \param other tuple to replace the contents of this tuple - template + template tuple& operator=(const tuple& other) noexcept; /// \brief For all \p i, assigns \p std::forward(get(other)) to \p rocprim::get(*this). /// \param other tuple to replace the contents of this tuple - template + template tuple& operator=(tuple&& other) noexcept; - #endif +#endif /// \brief Swaps the content of the tuple (\p *this) with the content \p other /// \param other tuple of values to swap @@ -752,76 +632,65 @@ class tuple }; #ifndef DOXYGEN_SHOULD_SKIP_THIS -template<> +template <> class tuple<> { public: - ROCPRIM_HOST_DEVICE inline - constexpr tuple() noexcept - { - } + ROCPRIM_HOST_DEVICE inline constexpr tuple() noexcept {} - ROCPRIM_HOST_DEVICE inline - ~tuple() = default; + ROCPRIM_HOST_DEVICE inline ~tuple() = default; - ROCPRIM_HOST_DEVICE inline - void swap(tuple&) noexcept - { - } + ROCPRIM_HOST_DEVICE inline void swap(tuple&) noexcept {} }; #endif namespace detail { -template -struct tuple_equal_to -{ - template - ROCPRIM_HOST_DEVICE inline - bool operator()(const T& lhs, const U& rhs) const + template + struct tuple_equal_to { - return tuple_equal_to()(lhs, rhs) && get(lhs) == get(rhs); - } -}; + template + ROCPRIM_HOST_DEVICE inline bool operator()(const T& lhs, const U& rhs) const + { + return tuple_equal_to()(lhs, rhs) && get(lhs) == get(rhs); + } + }; -template<> -struct tuple_equal_to<0> -{ - template - ROCPRIM_HOST_DEVICE inline - bool operator()(const T&, const U&) const + template <> + struct tuple_equal_to<0> { - return true; - } -}; + template + ROCPRIM_HOST_DEVICE inline bool operator()(const T&, const U&) const + { + return true; + } + }; -template -struct tuple_less_than -{ - template - ROCPRIM_HOST_DEVICE inline - bool operator()(const T& lhs, const U& rhs) const + template + struct tuple_less_than { - constexpr size_t idx = tuple_size::value - I; - if(get(lhs) < get(rhs)) - return true; - if(get(rhs) < get(lhs)) - return false; - return tuple_less_than()(lhs, rhs); - } -}; + template + ROCPRIM_HOST_DEVICE inline bool operator()(const T& lhs, const U& rhs) const + { + constexpr size_t idx = tuple_size::value - I; + if(get(lhs) < get(rhs)) + return true; + if(get(rhs) < get(lhs)) + return false; + return tuple_less_than()(lhs, rhs); + } + }; -template<> -struct tuple_less_than<0> -{ - template - ROCPRIM_HOST_DEVICE inline - bool operator()(const T&, const U&) const + template <> + struct tuple_less_than<0> { - return false; - } -}; + template + ROCPRIM_HOST_DEVICE inline bool operator()(const T&, const U&) const + { + return false; + } + }; } // end namespace detail @@ -838,15 +707,10 @@ struct tuple_less_than<0> /// \return \p true if rocprim::get(lhs) == rocprim::get(rhs) for all /// \p i in [0, sizeof...(TTypes)); otherwise - \p false. Comparing two /// empty tuples returns \p true. -template< - class... TTypes, - class... UTypes, - typename = typename std::enable_if< - sizeof...(TTypes) == sizeof...(UTypes) - >::type -> -ROCPRIM_HOST_DEVICE inline -bool operator==(const tuple& lhs, const tuple& rhs) +template ::type> +ROCPRIM_HOST_DEVICE inline bool operator==(const tuple& lhs, const tuple& rhs) { return detail::tuple_equal_to()(lhs, rhs); } @@ -863,9 +727,8 @@ bool operator==(const tuple& lhs, const tuple& rhs) /// \param lhs tuple to compare with \p rhs /// \param rhs tuple to compare with \p lhs /// \return !(lhr == rhs) -template -ROCPRIM_HOST_DEVICE inline -bool operator!=(const tuple& lhs, const tuple& rhs) +template +ROCPRIM_HOST_DEVICE inline bool operator!=(const tuple& lhs, const tuple& rhs) { return !(lhs == rhs); } @@ -883,15 +746,10 @@ bool operator!=(const tuple& lhs, const tuple& rhs) /// (!(bool)(rocprim::get<0>(rhs) < rocprim::get<0>(lhs)) && lhstail < rhstail), where /// \p lhstail is \p lhs without its first element, and \p rhstail is \p rhs without its first /// element. For two empty tuples, it returns \p false. -template< - class... TTypes, - class... UTypes, - typename = typename std::enable_if< - sizeof...(TTypes) == sizeof...(UTypes) - >::type -> -ROCPRIM_HOST_DEVICE inline -bool operator<(const tuple& lhs, const tuple& rhs) +template ::type> +ROCPRIM_HOST_DEVICE inline bool operator<(const tuple& lhs, const tuple& rhs) { return detail::tuple_less_than()(lhs, rhs); } @@ -906,9 +764,8 @@ bool operator<(const tuple& lhs, const tuple& rhs) /// \param lhs tuple to compare with \p rhs /// \param rhs tuple to compare with \p lhs /// \return rhs < lhs -template -ROCPRIM_HOST_DEVICE inline -bool operator>(const tuple& lhs, const tuple& rhs) +template +ROCPRIM_HOST_DEVICE inline bool operator>(const tuple& lhs, const tuple& rhs) { return rhs < lhs; } @@ -923,9 +780,8 @@ bool operator>(const tuple& lhs, const tuple& rhs) /// \param lhs tuple to compare with \p rhs /// \param rhs tuple to compare with \p lhs /// \return !(rhs < lhs) -template -ROCPRIM_HOST_DEVICE inline -bool operator<=(const tuple& lhs, const tuple& rhs) +template +ROCPRIM_HOST_DEVICE inline bool operator<=(const tuple& lhs, const tuple& rhs) { return !(rhs < lhs); } @@ -940,9 +796,8 @@ bool operator<=(const tuple& lhs, const tuple& rhs) /// \param lhs tuple to compare with \p rhs /// \param rhs tuple to compare with \p lhs /// \return !(lhs < rhs) -template -ROCPRIM_HOST_DEVICE inline -bool operator>=(const tuple& lhs, const tuple& rhs) +template +ROCPRIM_HOST_DEVICE inline bool operator>=(const tuple& lhs, const tuple& rhs) { return !(lhs < rhs); } @@ -953,9 +808,8 @@ bool operator>=(const tuple& lhs, const tuple& rhs) /// \brief Swaps the content of \p lhs tuple with the content \p rhs /// \param lhs,rhs tuples whose contents to swap -template -ROCPRIM_HOST_DEVICE inline -void swap(tuple& lhs, tuple& rhs) noexcept +template +ROCPRIM_HOST_DEVICE inline void swap(tuple& lhs, tuple& rhs) noexcept { lhs.swap(rhs); } @@ -968,9 +822,9 @@ void swap(tuple& lhs, tuple& rhs) noexcept /// an integer value from range [0, sizeof...(Types)). /// \param t tuple whose contents to extract /// \return constant refernce to the selected element of input tuple \p t. -template -ROCPRIM_HOST_DEVICE inline -const tuple_element_t>& get(const tuple& t) noexcept +template +ROCPRIM_HOST_DEVICE inline const tuple_element_t>& + get(const tuple& t) noexcept { using type = detail::tuple_value>>; return static_cast(t.base).get(); @@ -980,9 +834,8 @@ const tuple_element_t>& get(const tuple& t) noexcep /// an integer value from range [0, sizeof...(Types)). /// \param t tuple whose contents to extract /// \return refernce to the selected element of input tuple \p t. -template -ROCPRIM_HOST_DEVICE inline -tuple_element_t>& get(tuple& t) noexcept +template +ROCPRIM_HOST_DEVICE inline tuple_element_t>& get(tuple& t) noexcept { using type = detail::tuple_value>>; return static_cast(t.base).get(); @@ -992,12 +845,11 @@ tuple_element_t>& get(tuple& t) noexcept /// an integer value from range [0, sizeof...(Types)). /// \param t tuple whose contents to extract /// \return rvalue refernce to the selected element of input tuple \p t. -template -ROCPRIM_HOST_DEVICE inline -tuple_element_t>&& get(tuple&& t) noexcept +template +ROCPRIM_HOST_DEVICE inline tuple_element_t>&& get(tuple&& t) noexcept { using value_type = tuple_element_t>; - using type = detail::tuple_value>>; + using type = detail::tuple_value>>; return static_cast(static_cast(t.base).get()); } @@ -1008,27 +860,27 @@ tuple_element_t>&& get(tuple&& t) noexcept namespace detail { -template -struct make_tuple_return -{ - using type = T; -}; + template + struct make_tuple_return + { + using type = T; + }; -template -struct make_tuple_return> -{ - using type = T&; -}; + template + struct make_tuple_return> + { + using type = T&; + }; -template -using make_tuple_return_t = typename make_tuple_return::type>::type; + template + using make_tuple_return_t = typename make_tuple_return::type>::type; } // end detail namespace #ifndef DOXYGEN_SHOULD_SKIP_THIS -template -ROCPRIM_HOST_DEVICE inline -tuple...> make_tuple(Types&&... args) noexcept +template +ROCPRIM_HOST_DEVICE inline tuple...> + make_tuple(Types&&... args) noexcept { return tuple...>(std::forward(args)...); } @@ -1043,7 +895,7 @@ tuple...> make_tuple(Types&&... args) noexcep /// \param args - zero or more arguments to create tuple from /// /// \see std::tuple -template +template tuple make_tuple(Types&&... args); #endif @@ -1054,21 +906,18 @@ tuple make_tuple(Types&&... args); namespace detail { -struct ignore_t -{ - ROCPRIM_HOST_DEVICE inline - ignore_t() = default; + struct ignore_t + { + ROCPRIM_HOST_DEVICE inline ignore_t() = default; - ROCPRIM_HOST_DEVICE inline - ~ignore_t() = default; + ROCPRIM_HOST_DEVICE inline ~ignore_t() = default; - template - ROCPRIM_HOST_DEVICE inline - const ignore_t& operator=(const T&) const - { - return *this; - } -}; + template + ROCPRIM_HOST_DEVICE inline const ignore_t& operator=(const T&) const + { + return *this; + } + }; } #ifndef DOXYGEN_SHOULD_SKIP_THIS @@ -1094,9 +943,8 @@ const ignore_type ignore; /// \param args - zero or more input lvalue references used to create tuple /// /// \see std::tie -template -ROCPRIM_HOST_DEVICE inline -tuple tie(Types&... args) noexcept +template +ROCPRIM_HOST_DEVICE inline tuple tie(Types&... args) noexcept { return ::rocprim::tuple(args...); } diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_crosslane.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_crosslane.hpp index 8e989a6f4..b8f036146 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_crosslane.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_crosslane.hpp @@ -33,18 +33,14 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize, - bool UseAllReduce, - bool UseDPP = ROCPRIM_DETAIL_USE_DPP -> -using warp_reduce_crosslane = - typename std::conditional< - UseDPP, - warp_reduce_dpp, - warp_reduce_shuffle - >::type; + template + using warp_reduce_crosslane = + typename std::conditional, + warp_reduce_shuffle>::type; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp index b1d5a8deb..0cd5f51de 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp @@ -24,9 +24,9 @@ #include #include "../../config.hpp" +#include "../../detail/various.hpp" #include "../../intrinsics.hpp" #include "../../types.hpp" -#include "../../detail/various.hpp" #include "warp_reduce_shuffle.hpp" @@ -35,123 +35,118 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize, - bool UseAllReduce -> -class warp_reduce_dpp -{ -public: - static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); - - using storage_type = detail::empty_storage_type; - - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, BinaryFunction reduce_op) + template + class warp_reduce_dpp { - output = input; + public: + static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); + + using storage_type = detail::empty_storage_type; - if(WarpSize > 1) + template + ROCPRIM_DEVICE inline void reduce(T input, T& output, BinaryFunction reduce_op) { - // quad_perm:[1,0,3,2] -> 10110001 - output = reduce_op(warp_move_dpp(output, 0xb1), output); + output = input; + + if(WarpSize > 1) + { + // quad_perm:[1,0,3,2] -> 10110001 + output = reduce_op(warp_move_dpp(output, 0xb1), output); + } + if(WarpSize > 2) + { + // quad_perm:[2,3,0,1] -> 01001110 + output = reduce_op(warp_move_dpp(output, 0x4e), output); + } + if(WarpSize > 4) + { + // row_shr:4 + output = reduce_op(warp_move_dpp(output, 0x114), output); + } + if(WarpSize > 8) + { + // row_shr:8 + output = reduce_op(warp_move_dpp(output, 0x118), output); + } + if(WarpSize > 16) + { + // row_bcast:15 + output = reduce_op(warp_move_dpp(output, 0x142), output); + } + if(WarpSize > 32) + { + // row_bcast:31 + output = reduce_op(warp_move_dpp(output, 0x143), output); + } + + // Read the result from the last lane of the logical warp + output = warp_shuffle(output, WarpSize - 1, WarpSize); } - if(WarpSize > 2) + + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) { - // quad_perm:[2,3,0,1] -> 01001110 - output = reduce_op(warp_move_dpp(output, 0x4e), output); + (void)storage; // disables unused parameter warning + this->reduce(input, output, reduce_op); } - if(WarpSize > 4) + + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op) { - // row_shr:4 - output = reduce_op(warp_move_dpp(output, 0x114), output); + // Fallback to shuffle-based implementation + warp_reduce_shuffle().reduce( + input, output, valid_items, reduce_op); } - if(WarpSize > 8) + + template + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op) { - // row_shr:8 - output = reduce_op(warp_move_dpp(output, 0x118), output); + (void)storage; // disables unused parameter warning + this->reduce(input, output, valid_items, reduce_op); } - if(WarpSize > 16) + + template + ROCPRIM_DEVICE inline void + head_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) { - // row_bcast:15 - output = reduce_op(warp_move_dpp(output, 0x142), output); + // Fallback to shuffle-based implementation + warp_reduce_shuffle().head_segmented_reduce( + input, output, flag, reduce_op); } - if(WarpSize > 32) + + template + ROCPRIM_DEVICE inline void + tail_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) { - // row_bcast:31 - output = reduce_op(warp_move_dpp(output, 0x143), output); + // Fallback to shuffle-based implementation + warp_reduce_shuffle().tail_segmented_reduce( + input, output, flag, reduce_op); } - // Read the result from the last lane of the logical warp - output = warp_shuffle(output, WarpSize - 1, WarpSize); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) - { - (void) storage; // disables unused parameter warning - this->reduce(input, output, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op) - { - // Fallback to shuffle-based implementation - warp_reduce_shuffle() - .reduce(input, output, valid_items, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, unsigned int valid_items, - storage_type& storage, BinaryFunction reduce_op) - { - (void) storage; // disables unused parameter warning - this->reduce(input, output, valid_items, reduce_op); - } + template + ROCPRIM_DEVICE inline void head_segmented_reduce( + T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op) + { + // Fallback to shuffle-based implementation + warp_reduce_shuffle().head_segmented_reduce( + input, output, flag, storage, reduce_op); + } - template - ROCPRIM_DEVICE inline - void head_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) - { - // Fallback to shuffle-based implementation - warp_reduce_shuffle() - .head_segmented_reduce(input, output, flag, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void tail_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) - { - // Fallback to shuffle-based implementation - warp_reduce_shuffle() - .tail_segmented_reduce(input, output, flag, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void head_segmented_reduce(T input, T& output, Flag flag, - storage_type& storage, BinaryFunction reduce_op) - { - // Fallback to shuffle-based implementation - warp_reduce_shuffle() - .head_segmented_reduce(input, output, flag, storage, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void tail_segmented_reduce(T input, T& output, Flag flag, - storage_type& storage, BinaryFunction reduce_op) - { - // Fallback to shuffle-based implementation - warp_reduce_shuffle() - .tail_segmented_reduce(input, output, flag, storage, reduce_op); - } -}; + template + ROCPRIM_DEVICE inline void tail_segmented_reduce( + T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op) + { + // Fallback to shuffle-based implementation + warp_reduce_shuffle().tail_segmented_reduce( + input, output, flag, storage, reduce_op); + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp index 880d39a8d..3988e2e8f 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp @@ -24,9 +24,9 @@ #include #include "../../config.hpp" +#include "../../detail/various.hpp" #include "../../intrinsics.hpp" #include "../../types.hpp" -#include "../../detail/various.hpp" #include "warp_segment_bounds.hpp" @@ -35,130 +35,123 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize, - bool UseAllReduce -> -class warp_reduce_shared_mem -{ - struct storage_type_ + template + class warp_reduce_shared_mem { - T values[WarpSize]; - }; + struct storage_type_ + { + T values[WarpSize]; + }; -public: - using storage_type = detail::raw_storage; + public: + using storage_type = detail::raw_storage; - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) - { - constexpr unsigned int ceiling = next_power_of_two(WarpSize); - const unsigned int lid = detail::logical_lane_id(); - storage_type_& storage_ = storage.get(); - - output = input; - store_volatile(&storage_.values[lid], output); - #pragma unroll - for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) { - if (lid + i < WarpSize && lid < i) + constexpr unsigned int ceiling = next_power_of_two(WarpSize); + const unsigned int lid = detail::logical_lane_id(); + storage_type_& storage_ = storage.get(); + + output = input; + store_volatile(&storage_.values[lid], output); +#pragma unroll + for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) { - output = load_volatile(&storage_.values[lid]); - T other = load_volatile(&storage_.values[lid + i]); - output = reduce_op(output, other); - store_volatile(&storage_.values[lid], output); + if(lid + i < WarpSize && lid < i) + { + output = load_volatile(&storage_.values[lid]); + T other = load_volatile(&storage_.values[lid + i]); + output = reduce_op(output, other); + store_volatile(&storage_.values[lid], output); + } } + set_output(output, storage); } - set_output(output, storage); - } - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, unsigned int valid_items, - storage_type& storage, BinaryFunction reduce_op) - { - constexpr unsigned int ceiling = next_power_of_two(WarpSize); - const unsigned int lid = detail::logical_lane_id(); - storage_type_& storage_ = storage.get(); - - output = input; - store_volatile(&storage_.values[lid], output); - #pragma unroll - for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) + template + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op) { - if((lid + i) < WarpSize && lid < i && (lid + i) < valid_items) + constexpr unsigned int ceiling = next_power_of_two(WarpSize); + const unsigned int lid = detail::logical_lane_id(); + storage_type_& storage_ = storage.get(); + + output = input; + store_volatile(&storage_.values[lid], output); +#pragma unroll + for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) { - output = load_volatile(&storage_.values[lid]); - T other = load_volatile(&storage_.values[lid + i]); - output = reduce_op(output, other); - store_volatile(&storage_.values[lid], output); + if((lid + i) < WarpSize && lid < i && (lid + i) < valid_items) + { + output = load_volatile(&storage_.values[lid]); + T other = load_volatile(&storage_.values[lid + i]); + output = reduce_op(output, other); + store_volatile(&storage_.values[lid], output); + } } + set_output(output, storage); } - set_output(output, storage); - } - template - ROCPRIM_DEVICE inline - void head_segmented_reduce(T input, T& output, Flag flag, - storage_type& storage, BinaryFunction reduce_op) - { - this->segmented_reduce(input, output, flag, storage, reduce_op); - } + template + ROCPRIM_DEVICE inline void head_segmented_reduce( + T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op) + { + this->segmented_reduce(input, output, flag, storage, reduce_op); + } - template - ROCPRIM_DEVICE inline - void tail_segmented_reduce(T input, T& output, Flag flag, - storage_type& storage, BinaryFunction reduce_op) - { - this->segmented_reduce(input, output, flag, storage, reduce_op); - } - -private: - template - ROCPRIM_DEVICE inline - void segmented_reduce(T input, T& output, Flag flag, - storage_type& storage, BinaryFunction reduce_op) - { - const unsigned int lid = detail::logical_lane_id(); - constexpr unsigned int ceiling = next_power_of_two(WarpSize); - storage_type_& storage_ = storage.get(); - // Get logical lane id of the last valid value in the segment - auto last = last_in_warp_segment(flag); - - output = input; - #pragma unroll - for(unsigned int i = 1; i < ceiling; i *= 2) + template + ROCPRIM_DEVICE inline void tail_segmented_reduce( + T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op) { - store_volatile(&storage_.values[lid], output); - if((lid + i) <= last) + this->segmented_reduce(input, output, flag, storage, reduce_op); + } + + private: + template + ROCPRIM_DEVICE inline void segmented_reduce( + T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op) + { + const unsigned int lid = detail::logical_lane_id(); + constexpr unsigned int ceiling = next_power_of_two(WarpSize); + storage_type_& storage_ = storage.get(); + // Get logical lane id of the last valid value in the segment + auto last = last_in_warp_segment(flag); + + output = input; +#pragma unroll + for(unsigned int i = 1; i < ceiling; i *= 2) { - T other = load_volatile(&storage_.values[lid + i]); - output = reduce_op(output, other); + store_volatile(&storage_.values[lid], output); + if((lid + i) <= last) + { + T other = load_volatile(&storage_.values[lid + i]); + output = reduce_op(output, other); + } } } - } - template - ROCPRIM_DEVICE inline - typename std::enable_if<(Switch == false)>::type - set_output(T& output, storage_type& storage) - { - (void) output; - (void) storage; - // output already set correctly - } - - template - ROCPRIM_DEVICE inline - typename std::enable_if<(Switch == true)>::type - set_output(T& output, storage_type& storage) - { - storage_type_& storage_ = storage.get(); - output = load_volatile(&storage_.values[0]); - } -}; + template + ROCPRIM_DEVICE inline typename std::enable_if<(Switch == false)>::type + set_output(T& output, storage_type& storage) + { + (void)output; + (void)storage; + // output already set correctly + } + + template + ROCPRIM_DEVICE inline typename std::enable_if<(Switch == true)>::type + set_output(T& output, storage_type& storage) + { + storage_type_& storage_ = storage.get(); + output = load_volatile(&storage_.values[0]); + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp index c67cc4a89..54bffbea4 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp @@ -24,9 +24,9 @@ #include #include "../../config.hpp" +#include "../../detail/various.hpp" #include "../../intrinsics.hpp" #include "../../types.hpp" -#include "../../detail/various.hpp" #include "warp_segment_bounds.hpp" @@ -35,128 +35,120 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize, - bool UseAllReduce -> -class warp_reduce_shuffle -{ -public: - static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); + template + class warp_reduce_shuffle + { + public: + static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); - using storage_type = detail::empty_storage_type; + using storage_type = detail::empty_storage_type; - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, BinaryFunction reduce_op) - { - output = input; + template + ROCPRIM_DEVICE inline void reduce(T input, T& output, BinaryFunction reduce_op) + { + output = input; + + T value; +#pragma unroll + for(unsigned int offset = 1; offset < WarpSize; offset *= 2) + { + value = warp_shuffle_down(output, offset, WarpSize); + output = reduce_op(output, value); + } + set_output(output); + } - T value; - #pragma unroll - for(unsigned int offset = 1; offset < WarpSize; offset *= 2) + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) { - value = warp_shuffle_down(output, offset, WarpSize); - output = reduce_op(output, value); + (void)storage; // disables unused parameter warning + this->reduce(input, output, reduce_op); } - set_output(output); - } - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) - { - (void) storage; // disables unused parameter warning - this->reduce(input, output, reduce_op); - } + template + ROCPRIM_DEVICE inline void + reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op) + { + output = input; + + T value; +#pragma unroll + for(unsigned int offset = 1; offset < WarpSize; offset *= 2) + { + value = warp_shuffle_down(output, offset, WarpSize); + unsigned int id = detail::logical_lane_id(); + if(id + offset < valid_items) + output = reduce_op(output, value); + } + set_output(output); + } - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op) - { - output = input; + template + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + unsigned int valid_items, + storage_type& storage, + BinaryFunction reduce_op) + { + (void)storage; // disables unused parameter warning + this->reduce(input, output, valid_items, reduce_op); + } - T value; - #pragma unroll - for(unsigned int offset = 1; offset < WarpSize; offset *= 2) + template + ROCPRIM_DEVICE inline void + head_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) { - value = warp_shuffle_down(output, offset, WarpSize); - unsigned int id = detail::logical_lane_id(); - if (id + offset < valid_items) output = reduce_op(output, value); + this->segmented_reduce(input, output, flag, reduce_op); } - set_output(output); - } - template - ROCPRIM_DEVICE inline - void reduce(T input, T& output, unsigned int valid_items, - storage_type& storage, BinaryFunction reduce_op) - { - (void) storage; // disables unused parameter warning - this->reduce(input, output, valid_items, reduce_op); - } + template + ROCPRIM_DEVICE inline void + tail_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) + { + this->segmented_reduce(input, output, flag, reduce_op); + } - template - ROCPRIM_DEVICE inline - void head_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) - { - this->segmented_reduce(input, output, flag, reduce_op); - } + template + ROCPRIM_DEVICE inline void head_segmented_reduce( + T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op) + { + (void)storage; + this->segmented_reduce(input, output, flag, reduce_op); + } - template - ROCPRIM_DEVICE inline - void tail_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) - { - this->segmented_reduce(input, output, flag, reduce_op); - } + template + ROCPRIM_DEVICE inline void tail_segmented_reduce( + T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op) + { + (void)storage; + this->segmented_reduce(input, output, flag, reduce_op); + } - template - ROCPRIM_DEVICE inline - void head_segmented_reduce(T input, T& output, Flag flag, - storage_type& storage, BinaryFunction reduce_op) - { - (void) storage; - this->segmented_reduce(input, output, flag, reduce_op); - } - - template - ROCPRIM_DEVICE inline - void tail_segmented_reduce(T input, T& output, Flag flag, - storage_type& storage, BinaryFunction reduce_op) - { - (void) storage; - this->segmented_reduce(input, output, flag, reduce_op); - } - -private: - template - ROCPRIM_DEVICE inline - void segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) - { - // Get logical lane id of the last valid value in the segment, - // and convert it to number of valid values in segment. - auto valid_items_in_segment = last_in_warp_segment(flag) + 1U; - this->reduce(input, output, valid_items_in_segment, reduce_op); - } - - template - ROCPRIM_DEVICE inline - typename std::enable_if<(Switch == false)>::type - set_output(T& output) - { - (void) output; - // output already set correctly - } - - template - ROCPRIM_DEVICE inline - typename std::enable_if<(Switch == true)>::type - set_output(T& output) - { - output = warp_shuffle(output, 0, WarpSize); - } -}; + private: + template + ROCPRIM_DEVICE inline void + segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op) + { + // Get logical lane id of the last valid value in the segment, + // and convert it to number of valid values in segment. + auto valid_items_in_segment = last_in_warp_segment(flag) + 1U; + this->reduce(input, output, valid_items_in_segment, reduce_op); + } + + template + ROCPRIM_DEVICE inline typename std::enable_if<(Switch == false)>::type set_output(T& output) + { + (void)output; + // output already set correctly + } + + template + ROCPRIM_DEVICE inline typename std::enable_if<(Switch == true)>::type set_output(T& output) + { + output = warp_shuffle(output, 0, WarpSize); + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_crosslane.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_crosslane.hpp index f6b6df3ea..03c805fda 100644 --- a/rocprim/include/rocprim/warp/detail/warp_scan_crosslane.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_scan_crosslane.hpp @@ -33,17 +33,9 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize, - bool UseDPP = ROCPRIM_DETAIL_USE_DPP -> -using warp_scan_crosslane = - typename std::conditional< - UseDPP, - warp_scan_dpp, - warp_scan_shuffle - >::type; + template + using warp_scan_crosslane = typename std:: + conditional, warp_scan_shuffle>::type; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp index 17314f976..1d286d40f 100644 --- a/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp @@ -34,226 +34,230 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize -> -class warp_scan_dpp -{ -public: - static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); - - using storage_type = detail::empty_storage_type; - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, BinaryFunction scan_op) + template + class warp_scan_dpp { - const unsigned int lane_id = ::rocprim::lane_id(); - const unsigned int row_lane_id = lane_id % ::rocprim::min(16u, WarpSize); + public: + static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); - output = input; + using storage_type = detail::empty_storage_type; - if(WarpSize > 1) + template + ROCPRIM_DEVICE inline void inclusive_scan(T input, T& output, BinaryFunction scan_op) { - T t = scan_op(warp_move_dpp(output, 0x111), output); // row_shr:1 - if(row_lane_id >= 1) output = t; + const unsigned int lane_id = ::rocprim::lane_id(); + const unsigned int row_lane_id = lane_id % ::rocprim::min(16u, WarpSize); + + output = input; + + if(WarpSize > 1) + { + T t = scan_op(warp_move_dpp(output, 0x111), output); // row_shr:1 + if(row_lane_id >= 1) + output = t; + } + if(WarpSize > 2) + { + T t = scan_op(warp_move_dpp(output, 0x112), output); // row_shr:2 + if(row_lane_id >= 2) + output = t; + } + if(WarpSize > 4) + { + T t = scan_op(warp_move_dpp(output, 0x114), output); // row_shr:4 + if(row_lane_id >= 4) + output = t; + } + if(WarpSize > 8) + { + T t = scan_op(warp_move_dpp(output, 0x118), output); // row_shr:8 + if(row_lane_id >= 8) + output = t; + } + if(WarpSize > 16) + { + T t = scan_op(warp_move_dpp(output, 0x142), output); // row_bcast:15 + if(lane_id % 32 >= 16) + output = t; + } + if(WarpSize > 32) + { + T t = scan_op(warp_move_dpp(output, 0x143), output); // row_bcast:31 + if(lane_id >= 32) + output = t; + } } - if(WarpSize > 2) + + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) { - T t = scan_op(warp_move_dpp(output, 0x112), output); // row_shr:2 - if(row_lane_id >= 2) output = t; + (void)storage; // disables unused parameter warning + inclusive_scan(input, output, scan_op); } - if(WarpSize > 4) + + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, T& reduction, BinaryFunction scan_op) { - T t = scan_op(warp_move_dpp(output, 0x114), output); // row_shr:4 - if(row_lane_id >= 4) output = t; + inclusive_scan(input, output, scan_op); + // Broadcast value from the last thread in warp + reduction = warp_shuffle(output, WarpSize - 1, WarpSize); } - if(WarpSize > 8) + + template + ROCPRIM_DEVICE inline void inclusive_scan( + T input, T& output, T& reduction, storage_type& storage, BinaryFunction scan_op) { - T t = scan_op(warp_move_dpp(output, 0x118), output); // row_shr:8 - if(row_lane_id >= 8) output = t; + (void)storage; + inclusive_scan(input, output, reduction, scan_op); } - if(WarpSize > 16) + + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, BinaryFunction scan_op) { - T t = scan_op(warp_move_dpp(output, 0x142), output); // row_bcast:15 - if(lane_id % 32 >= 16) output = t; + inclusive_scan(input, output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(output, output, init, scan_op); } - if(WarpSize > 32) + + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, storage_type& storage, BinaryFunction scan_op) { - T t = scan_op(warp_move_dpp(output, 0x143), output); // row_bcast:31 - if(lane_id >= 32) output = t; + (void)storage; // disables unused parameter warning + exclusive_scan(input, output, init, scan_op); } - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - inclusive_scan(input, output, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, T& reduction, - BinaryFunction scan_op) - { - inclusive_scan(input, output, scan_op); - // Broadcast value from the last thread in warp - reduction = warp_shuffle(output, WarpSize-1, WarpSize); - } - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; - inclusive_scan(input, output, reduction, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, BinaryFunction scan_op) - { - inclusive_scan(input, output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(output, output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - exclusive_scan(input, output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - inclusive_scan(input, output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(output, output); - } + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) + { + (void)storage; // disables unused parameter warning + inclusive_scan(input, output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(output, output); + } - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, T& reduction, - BinaryFunction scan_op) - { - inclusive_scan(input, output, scan_op); - // Broadcast value from the last thread in warp - reduction = warp_shuffle(output, WarpSize-1, WarpSize); - // Convert inclusive scan result to exclusive - to_exclusive(output, output, init, scan_op); - } + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, T& reduction, BinaryFunction scan_op) + { + inclusive_scan(input, output, scan_op); + // Broadcast value from the last thread in warp + reduction = warp_shuffle(output, WarpSize - 1, WarpSize); + // Convert inclusive scan result to exclusive + to_exclusive(output, output, init, scan_op); + } - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; - exclusive_scan(input, output, init, reduction, scan_op); - } + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, T& reduction, storage_type& storage, BinaryFunction scan_op) + { + (void)storage; + exclusive_scan(input, output, init, reduction, scan_op); + } - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, - BinaryFunction scan_op) - { - inclusive_scan(input, inclusive_output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(inclusive_output, exclusive_output, init, scan_op); - } + template + ROCPRIM_DEVICE inline void + scan(T input, T& inclusive_output, T& exclusive_output, T init, BinaryFunction scan_op) + { + inclusive_scan(input, inclusive_output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(inclusive_output, exclusive_output, init, scan_op); + } - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - scan(input, inclusive_output, exclusive_output, init, scan_op); - } + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + storage_type& storage, + BinaryFunction scan_op) + { + (void)storage; // disables unused parameter warning + scan(input, inclusive_output, exclusive_output, init, scan_op); + } - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - inclusive_scan(input, inclusive_output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(inclusive_output, exclusive_output); - } + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + storage_type& storage, + BinaryFunction scan_op) + { + (void)storage; // disables unused parameter warning + inclusive_scan(input, inclusive_output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(inclusive_output, exclusive_output); + } - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction, - BinaryFunction scan_op) - { - inclusive_scan(input, inclusive_output, scan_op); - // Broadcast value from the last thread in warp - reduction = warp_shuffle(inclusive_output, WarpSize-1, WarpSize); - // Convert inclusive scan result to exclusive - to_exclusive(inclusive_output, exclusive_output, init, scan_op); - } + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + T& reduction, + BinaryFunction scan_op) + { + inclusive_scan(input, inclusive_output, scan_op); + // Broadcast value from the last thread in warp + reduction = warp_shuffle(inclusive_output, WarpSize - 1, WarpSize); + // Convert inclusive scan result to exclusive + to_exclusive(inclusive_output, exclusive_output, init, scan_op); + } - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; - scan(input, inclusive_output, exclusive_output, init, reduction, scan_op); - } + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op) + { + (void)storage; + scan(input, inclusive_output, exclusive_output, init, reduction, scan_op); + } - ROCPRIM_DEVICE inline - T broadcast(T input, const unsigned int src_lane, storage_type& storage) - { - (void) storage; - return warp_shuffle(input, src_lane, WarpSize); - } + ROCPRIM_DEVICE inline T + broadcast(T input, const unsigned int src_lane, storage_type& storage) + { + (void)storage; + return warp_shuffle(input, src_lane, WarpSize); + } -protected: - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) - { - (void) storage; - return to_exclusive(inclusive_input, exclusive_output); - } + protected: + ROCPRIM_DEVICE inline void + to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) + { + (void)storage; + return to_exclusive(inclusive_input, exclusive_output); + } -private: - // Changes inclusive scan results to exclusive scan results - template - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output, T init, - BinaryFunction scan_op) - { - // include init value in scan results - exclusive_output = scan_op(init, inclusive_input); - // get exclusive results - exclusive_output = warp_shuffle_up(exclusive_output, 1, WarpSize); - if(detail::logical_lane_id() == 0) + private: + // Changes inclusive scan results to exclusive scan results + template + ROCPRIM_DEVICE inline void + to_exclusive(T inclusive_input, T& exclusive_output, T init, BinaryFunction scan_op) { - exclusive_output = init; + // include init value in scan results + exclusive_output = scan_op(init, inclusive_input); + // get exclusive results + exclusive_output = warp_shuffle_up(exclusive_output, 1, WarpSize); + if(detail::logical_lane_id() == 0) + { + exclusive_output = init; + } } - } - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output) - { - // shift to get exclusive results - exclusive_output = warp_shuffle_up(inclusive_input, 1, WarpSize); - } -}; + ROCPRIM_DEVICE inline void to_exclusive(T inclusive_input, T& exclusive_output) + { + // shift to get exclusive results + exclusive_output = warp_shuffle_up(inclusive_input, 1, WarpSize); + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_shared_mem.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_shared_mem.hpp index afe2bc94a..6f30315b1 100644 --- a/rocprim/include/rocprim/warp/detail/warp_scan_shared_mem.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_scan_shared_mem.hpp @@ -34,155 +34,155 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize -> -class warp_scan_shared_mem -{ - struct storage_type_ + template + class warp_scan_shared_mem { - T threads[WarpSize]; - }; -public: - using storage_type = detail::raw_storage; + struct storage_type_ + { + T threads[WarpSize]; + }; - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, - storage_type& storage, BinaryFunction scan_op) - { - const unsigned int lid = detail::logical_lane_id(); - storage_type_& storage_ = storage.get(); + public: + using storage_type = detail::raw_storage; - T me = input; - store_volatile(&storage_.threads[lid], me); - for(unsigned int i = 1; i < WarpSize; i *= 2) + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) { - if(lid >= i) + const unsigned int lid = detail::logical_lane_id(); + storage_type_& storage_ = storage.get(); + + T me = input; + store_volatile(&storage_.threads[lid], me); + for(unsigned int i = 1; i < WarpSize; i *= 2) { - T other = load_volatile(&storage_.threads[lid - i]); - me = scan_op(other, me); - store_volatile(&storage_.threads[lid], me); + if(lid >= i) + { + T other = load_volatile(&storage_.threads[lid - i]); + me = scan_op(other, me); + store_volatile(&storage_.threads[lid], me); + } } + output = me; } - output = me; - } - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - inclusive_scan(input, output, storage, scan_op); - reduction = load_volatile(&storage_.threads[WarpSize - 1]); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, - storage_type& storage, BinaryFunction scan_op) - { - inclusive_scan(input, output, storage, scan_op); - to_exclusive(output, init, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, - storage_type& storage, BinaryFunction scan_op) - { - inclusive_scan(input, output, storage, scan_op); - to_exclusive(output, storage); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - inclusive_scan(input, output, storage, scan_op); - reduction = load_volatile(&storage_.threads[WarpSize - 1]); - to_exclusive(output, init, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, - storage_type& storage, BinaryFunction scan_op) - { - inclusive_scan(input, inclusive_output, storage, scan_op); - to_exclusive(exclusive_output, init, storage, scan_op); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, - storage_type& storage, BinaryFunction scan_op) - { - inclusive_scan(input, inclusive_output, storage, scan_op); - to_exclusive(exclusive_output, storage); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - storage_type_& storage_ = storage.get(); - inclusive_scan(input, inclusive_output, storage, scan_op); - reduction = load_volatile(&storage_.threads[WarpSize - 1]); - to_exclusive(exclusive_output, init, storage, scan_op); - } - - ROCPRIM_DEVICE inline - T broadcast(T input, const unsigned int src_lane, storage_type& storage) - { - storage_type_& storage_ = storage.get(); - if(src_lane == detail::logical_lane_id()) + template + ROCPRIM_DEVICE inline void inclusive_scan( + T input, T& output, T& reduction, storage_type& storage, BinaryFunction scan_op) { - store_volatile(&storage_.threads[src_lane], input); + storage_type_& storage_ = storage.get(); + inclusive_scan(input, output, storage, scan_op); + reduction = load_volatile(&storage_.threads[WarpSize - 1]); } - return load_volatile(&storage_.threads[src_lane]); - } -protected: - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) - { - (void) inclusive_input; - return to_exclusive(exclusive_output, storage); - } - -private: - // Calculate exclusive results base on inclusive scan results in storage.threads[]. - template - ROCPRIM_DEVICE inline - void to_exclusive(T& exclusive_output, T init, - storage_type& storage, BinaryFunction scan_op) - { - const unsigned int lid = detail::logical_lane_id(); - storage_type_& storage_ = storage.get(); - exclusive_output = init; - if(lid != 0) + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, storage_type& storage, BinaryFunction scan_op) { - exclusive_output = scan_op(init, load_volatile(&storage_.threads[lid-1])); + inclusive_scan(input, output, storage, scan_op); + to_exclusive(output, init, storage, scan_op); } - } - ROCPRIM_DEVICE inline - void to_exclusive(T& exclusive_output, storage_type& storage) - { - const unsigned int lid = detail::logical_lane_id(); - storage_type_& storage_ = storage.get(); - if(lid != 0) + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) + { + inclusive_scan(input, output, storage, scan_op); + to_exclusive(output, storage); + } + + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, T& reduction, storage_type& storage, BinaryFunction scan_op) + { + storage_type_& storage_ = storage.get(); + inclusive_scan(input, output, storage, scan_op); + reduction = load_volatile(&storage_.threads[WarpSize - 1]); + to_exclusive(output, init, storage, scan_op); + } + + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + storage_type& storage, + BinaryFunction scan_op) { - exclusive_output = load_volatile(&storage_.threads[lid-1]); + inclusive_scan(input, inclusive_output, storage, scan_op); + to_exclusive(exclusive_output, init, storage, scan_op); } - } -}; + + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + storage_type& storage, + BinaryFunction scan_op) + { + inclusive_scan(input, inclusive_output, storage, scan_op); + to_exclusive(exclusive_output, storage); + } + + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op) + { + storage_type_& storage_ = storage.get(); + inclusive_scan(input, inclusive_output, storage, scan_op); + reduction = load_volatile(&storage_.threads[WarpSize - 1]); + to_exclusive(exclusive_output, init, storage, scan_op); + } + + ROCPRIM_DEVICE inline T + broadcast(T input, const unsigned int src_lane, storage_type& storage) + { + storage_type_& storage_ = storage.get(); + if(src_lane == detail::logical_lane_id()) + { + store_volatile(&storage_.threads[src_lane], input); + } + return load_volatile(&storage_.threads[src_lane]); + } + + protected: + ROCPRIM_DEVICE inline void + to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) + { + (void)inclusive_input; + return to_exclusive(exclusive_output, storage); + } + + private: + // Calculate exclusive results base on inclusive scan results in storage.threads[]. + template + ROCPRIM_DEVICE inline void + to_exclusive(T& exclusive_output, T init, storage_type& storage, BinaryFunction scan_op) + { + const unsigned int lid = detail::logical_lane_id(); + storage_type_& storage_ = storage.get(); + exclusive_output = init; + if(lid != 0) + { + exclusive_output = scan_op(init, load_volatile(&storage_.threads[lid - 1])); + } + } + + ROCPRIM_DEVICE inline void to_exclusive(T& exclusive_output, storage_type& storage) + { + const unsigned int lid = detail::logical_lane_id(); + storage_type_& storage_ = storage.get(); + if(lid != 0) + { + exclusive_output = load_volatile(&storage_.threads[lid - 1]); + } + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp index f604d41f7..b97568e00 100644 --- a/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp @@ -34,201 +34,200 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class T, - unsigned int WarpSize -> -class warp_scan_shuffle -{ -public: - static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); + template + class warp_scan_shuffle + { + public: + static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); - using storage_type = detail::empty_storage_type; + using storage_type = detail::empty_storage_type; - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, BinaryFunction scan_op) - { - output = input; + template + ROCPRIM_DEVICE inline void inclusive_scan(T input, T& output, BinaryFunction scan_op) + { + output = input; + + T value; + const unsigned int id = detail::logical_lane_id(); +#pragma unroll + for(unsigned int offset = 1; offset < WarpSize; offset *= 2) + { + value = warp_shuffle_up(output, offset, WarpSize); + if(id >= offset) + output = scan_op(value, output); + } + } - T value; - const unsigned int id = detail::logical_lane_id(); - #pragma unroll - for(unsigned int offset = 1; offset < WarpSize; offset *= 2) + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) { - value = warp_shuffle_up(output, offset, WarpSize); - if(id >= offset) output = scan_op(value, output); + (void)storage; // disables unused parameter warning + inclusive_scan(input, output, scan_op); } - } - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - inclusive_scan(input, output, scan_op); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, T& reduction, - BinaryFunction scan_op) - { - inclusive_scan(input, output, scan_op); - // Broadcast value from the last thread in warp - reduction = warp_shuffle(output, WarpSize-1, WarpSize); - } - - template - ROCPRIM_DEVICE inline - void inclusive_scan(T input, T& output, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; - inclusive_scan(input, output, reduction, scan_op); - } + template + ROCPRIM_DEVICE inline void + inclusive_scan(T input, T& output, T& reduction, BinaryFunction scan_op) + { + inclusive_scan(input, output, scan_op); + // Broadcast value from the last thread in warp + reduction = warp_shuffle(output, WarpSize - 1, WarpSize); + } - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, BinaryFunction scan_op) - { - inclusive_scan(input, output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(output, output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - exclusive_scan(input, output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - inclusive_scan(input, output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(output, output); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, T& reduction, - BinaryFunction scan_op) - { - inclusive_scan(input, output, scan_op); - // Broadcast value from the last thread in warp - reduction = warp_shuffle(output, WarpSize-1, WarpSize); - // Convert inclusive scan result to exclusive - to_exclusive(output, output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void exclusive_scan(T input, T& output, T init, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; - exclusive_scan(input, output, init, reduction, scan_op); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, - BinaryFunction scan_op) - { - inclusive_scan(input, inclusive_output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(inclusive_output, exclusive_output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - scan(input, inclusive_output, exclusive_output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; // disables unused parameter warning - inclusive_scan(input, inclusive_output, scan_op); - // Convert inclusive scan result to exclusive - to_exclusive(inclusive_output, exclusive_output); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction, - BinaryFunction scan_op) - { - inclusive_scan(input, inclusive_output, scan_op); - // Broadcast value from the last thread in warp - reduction = warp_shuffle(inclusive_output, WarpSize-1, WarpSize); - // Convert inclusive scan result to exclusive - to_exclusive(inclusive_output, exclusive_output, init, scan_op); - } - - template - ROCPRIM_DEVICE inline - void scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction, - storage_type& storage, BinaryFunction scan_op) - { - (void) storage; - scan(input, inclusive_output, exclusive_output, init, reduction, scan_op); - } + template + ROCPRIM_DEVICE inline void inclusive_scan( + T input, T& output, T& reduction, storage_type& storage, BinaryFunction scan_op) + { + (void)storage; + inclusive_scan(input, output, reduction, scan_op); + } - ROCPRIM_DEVICE inline - T broadcast(T input, const unsigned int src_lane, storage_type& storage) - { - (void) storage; - return warp_shuffle(input, src_lane, WarpSize); - } + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, BinaryFunction scan_op) + { + inclusive_scan(input, output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(output, output, init, scan_op); + } -protected: - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) - { - (void) storage; - return to_exclusive(inclusive_input, exclusive_output); - } - -private: - // Changes inclusive scan results to exclusive scan results - template - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output, T init, - BinaryFunction scan_op) - { - // include init value in scan results - exclusive_output = scan_op(init, inclusive_input); - // get exclusive results - exclusive_output = warp_shuffle_up(exclusive_output, 1, WarpSize); - if(detail::logical_lane_id() == 0) + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, storage_type& storage, BinaryFunction scan_op) { - exclusive_output = init; + (void)storage; // disables unused parameter warning + exclusive_scan(input, output, init, scan_op); } - } - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output) - { - // shift to get exclusive results - exclusive_output = warp_shuffle_up(inclusive_input, 1, WarpSize); - } -}; + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op) + { + (void)storage; // disables unused parameter warning + inclusive_scan(input, output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(output, output); + } + + template + ROCPRIM_DEVICE inline void + exclusive_scan(T input, T& output, T init, T& reduction, BinaryFunction scan_op) + { + inclusive_scan(input, output, scan_op); + // Broadcast value from the last thread in warp + reduction = warp_shuffle(output, WarpSize - 1, WarpSize); + // Convert inclusive scan result to exclusive + to_exclusive(output, output, init, scan_op); + } + + template + ROCPRIM_DEVICE inline void exclusive_scan( + T input, T& output, T init, T& reduction, storage_type& storage, BinaryFunction scan_op) + { + (void)storage; + exclusive_scan(input, output, init, reduction, scan_op); + } + + template + ROCPRIM_DEVICE inline void + scan(T input, T& inclusive_output, T& exclusive_output, T init, BinaryFunction scan_op) + { + inclusive_scan(input, inclusive_output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(inclusive_output, exclusive_output, init, scan_op); + } + + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + storage_type& storage, + BinaryFunction scan_op) + { + (void)storage; // disables unused parameter warning + scan(input, inclusive_output, exclusive_output, init, scan_op); + } + + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + storage_type& storage, + BinaryFunction scan_op) + { + (void)storage; // disables unused parameter warning + inclusive_scan(input, inclusive_output, scan_op); + // Convert inclusive scan result to exclusive + to_exclusive(inclusive_output, exclusive_output); + } + + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + T& reduction, + BinaryFunction scan_op) + { + inclusive_scan(input, inclusive_output, scan_op); + // Broadcast value from the last thread in warp + reduction = warp_shuffle(inclusive_output, WarpSize - 1, WarpSize); + // Convert inclusive scan result to exclusive + to_exclusive(inclusive_output, exclusive_output, init, scan_op); + } + + template + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op) + { + (void)storage; + scan(input, inclusive_output, exclusive_output, init, reduction, scan_op); + } + + ROCPRIM_DEVICE inline T + broadcast(T input, const unsigned int src_lane, storage_type& storage) + { + (void)storage; + return warp_shuffle(input, src_lane, WarpSize); + } + + protected: + ROCPRIM_DEVICE inline void + to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) + { + (void)storage; + return to_exclusive(inclusive_input, exclusive_output); + } + + private: + // Changes inclusive scan results to exclusive scan results + template + ROCPRIM_DEVICE inline void + to_exclusive(T inclusive_input, T& exclusive_output, T init, BinaryFunction scan_op) + { + // include init value in scan results + exclusive_output = scan_op(init, inclusive_input); + // get exclusive results + exclusive_output = warp_shuffle_up(exclusive_output, 1, WarpSize); + if(detail::logical_lane_id() == 0) + { + exclusive_output = init; + } + } + + ROCPRIM_DEVICE inline void to_exclusive(T inclusive_input, T& exclusive_output) + { + // shift to get exclusive results + exclusive_output = warp_shuffle_up(inclusive_input, 1, WarpSize); + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp index 4c9129a73..3fd1ebc80 100644 --- a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp @@ -31,30 +31,29 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Returns logical warp id of the last thread in thread's segment -template -ROCPRIM_DEVICE inline -unsigned int last_in_warp_segment(Flag flag) -{ - // Get flags (now every thread know where the flags are) - auto warp_flags = ::rocprim::ballot(flag); - using ballot_type = decltype(warp_flags); - - // In case of head flags change them to tail flags - if(HeadSegmented) + // Returns logical warp id of the last thread in thread's segment + template + ROCPRIM_DEVICE inline unsigned int last_in_warp_segment(Flag flag) { - warp_flags >>= 1; + // Get flags (now every thread know where the flags are) + auto warp_flags = ::rocprim::ballot(flag); + using ballot_type = decltype(warp_flags); + + // In case of head flags change them to tail flags + if(HeadSegmented) + { + warp_flags >>= 1; + } + const auto lane_id = ::rocprim::lane_id(); + // Zero bits from thread with lower lane id + warp_flags &= ballot_type(-1) ^ ((ballot_type(1) << lane_id) - 1U); + // Ignore bits from thread from other (previous) logical warps + warp_flags >>= (lane_id / WarpSize) * WarpSize; + // Make sure last item in logical warp is marked as a tail + warp_flags |= ballot_type(1) << (WarpSize - 1U); + // Calculate logical lane id of the last valid value in the segment + return ::__lastbit_u32_u64(warp_flags); } - const auto lane_id = ::rocprim::lane_id(); - // Zero bits from thread with lower lane id - warp_flags &= ballot_type(-1) ^ ((ballot_type(1) << lane_id) - 1U); - // Ignore bits from thread from other (previous) logical warps - warp_flags >>= (lane_id / WarpSize) * WarpSize; - // Make sure last item in logical warp is marked as a tail - warp_flags |= ballot_type(1) << (WarpSize - 1U); - // Calculate logical lane id of the last valid value in the segment - return ::__lastbit_u32_u64(warp_flags); -} } // end namespace detail diff --git a/rocprim/include/rocprim/warp/detail/warp_sort_shuffle.hpp b/rocprim/include/rocprim/warp/detail/warp_sort_shuffle.hpp index a104faa18..b87b20aa6 100644 --- a/rocprim/include/rocprim/warp/detail/warp_sort_shuffle.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_sort_shuffle.hpp @@ -26,149 +26,136 @@ #include "../../config.hpp" #include "../../detail/various.hpp" -#include "../../intrinsics.hpp" #include "../../functional.hpp" +#include "../../intrinsics.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -template< - class Key, - unsigned int WarpSize, - class Value -> -class warp_sort_shuffle -{ -private: - template - ROCPRIM_DEVICE inline - typename std::enable_if warp)>::type - swap(Key& k, Value& v, int mask, int dir, BinaryFunction compare_function) - { - (void) k; - (void) v; - (void) mask; - (void) dir; - (void) compare_function; - } - - template - ROCPRIM_DEVICE inline - typename std::enable_if<(WarpSize > warp)>::type - swap(Key& k, Value& v, int mask, int dir, BinaryFunction compare_function) + template + class warp_sort_shuffle { - Key k1 = warp_shuffle_xor(k, mask, WarpSize); - Value v1 = warp_shuffle_xor(v, mask, WarpSize); - const bool k_is_less_than_k1 = compare_function(k, k1); - if(k_is_less_than_k1 == dir) + private: + template + ROCPRIM_DEVICE inline typename std::enable_if warp)>::type + swap(Key& k, Value& v, int mask, int dir, BinaryFunction compare_function) { - k = k1; - v = v1; + (void)k; + (void)v; + (void)mask; + (void)dir; + (void)compare_function; } - } - template - ROCPRIM_DEVICE inline - typename std::enable_if warp)>::type - swap(Key& k, int mask, int dir, BinaryFunction compare_function) - { - (void) k; - (void) mask; - (void) dir; - (void) compare_function; - } - - template - ROCPRIM_DEVICE inline - typename std::enable_if<(WarpSize > warp)>::type - swap(Key& k, int mask, int dir, BinaryFunction compare_function) - { - Key k1 = warp_shuffle_xor(k, mask, WarpSize); - if(compare_function(k, k1) == dir) + template + ROCPRIM_DEVICE inline typename std::enable_if<(WarpSize > warp)>::type + swap(Key& k, Value& v, int mask, int dir, BinaryFunction compare_function) { - k = k1; + Key k1 = warp_shuffle_xor(k, mask, WarpSize); + Value v1 = warp_shuffle_xor(v, mask, WarpSize); + const bool k_is_less_than_k1 = compare_function(k, k1); + if(k_is_less_than_k1 == dir) + { + k = k1; + v = v1; + } } - } - template - ROCPRIM_DEVICE inline - void bitonic_sort(BinaryFunction compare_function, KeyValue&... kv) - { - static_assert( - sizeof...(KeyValue) < 3, - "KeyValue parameter pack can 1 or 2 elements (key, or key and value)" - ); - - unsigned int id = detail::logical_lane_id(); - swap<2, BinaryFunction>(kv..., 1, get_bit(id, 1) ^ get_bit(id, 0), compare_function); - - swap<4, BinaryFunction>(kv..., 2, get_bit(id, 2) ^ get_bit(id, 1), compare_function); - swap<4, BinaryFunction>(kv..., 1, get_bit(id, 2) ^ get_bit(id, 0), compare_function); - - swap<8, BinaryFunction>(kv..., 4, get_bit(id, 3) ^ get_bit(id, 2), compare_function); - swap<8, BinaryFunction>(kv..., 2, get_bit(id, 3) ^ get_bit(id, 1), compare_function); - swap<8, BinaryFunction>(kv..., 1, get_bit(id, 3) ^ get_bit(id, 0), compare_function); - - swap<16, BinaryFunction>(kv..., 8, get_bit(id, 4) ^ get_bit(id, 3), compare_function); - swap<16, BinaryFunction>(kv..., 4, get_bit(id, 4) ^ get_bit(id, 2), compare_function); - swap<16, BinaryFunction>(kv..., 2, get_bit(id, 4) ^ get_bit(id, 1), compare_function); - swap<16, BinaryFunction>(kv..., 1, get_bit(id, 4) ^ get_bit(id, 0), compare_function); - - swap<32, BinaryFunction>(kv..., 16, get_bit(id, 5) ^ get_bit(id, 4), compare_function); - swap<32, BinaryFunction>(kv..., 8, get_bit(id, 5) ^ get_bit(id, 3), compare_function); - swap<32, BinaryFunction>(kv..., 4, get_bit(id, 5) ^ get_bit(id, 2), compare_function); - swap<32, BinaryFunction>(kv..., 2, get_bit(id, 5) ^ get_bit(id, 1), compare_function); - swap<32, BinaryFunction>(kv..., 1, get_bit(id, 5) ^ get_bit(id, 0), compare_function); - - swap<32, BinaryFunction>(kv..., 32, get_bit(id, 5), compare_function); - swap<16, BinaryFunction>(kv..., 16, get_bit(id, 4), compare_function); - swap<8, BinaryFunction>(kv..., 8, get_bit(id, 3), compare_function); - swap<4, BinaryFunction>(kv..., 4, get_bit(id, 2), compare_function); - swap<2, BinaryFunction>(kv..., 2, get_bit(id, 1), compare_function); - swap<0, BinaryFunction>(kv..., 1, get_bit(id, 0), compare_function); - } - -public: - static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); - - using storage_type = ::rocprim::detail::empty_storage_type; - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_value, BinaryFunction compare_function) - { - // sort by value only - bitonic_sort(compare_function, thread_value); - } - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_value, storage_type& storage, - BinaryFunction compare_function) - { - (void) storage; - sort(thread_value, compare_function); - } - - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, Value& thread_value, - BinaryFunction compare_function) - { - bitonic_sort(compare_function, thread_key, thread_value); - } + template + ROCPRIM_DEVICE inline typename std::enable_if warp)>::type + swap(Key& k, int mask, int dir, BinaryFunction compare_function) + { + (void)k; + (void)mask; + (void)dir; + (void)compare_function; + } - template - ROCPRIM_DEVICE inline - void sort(Key& thread_key, Value& thread_value, - storage_type& storage, BinaryFunction compare_function) - { - (void) storage; - return sort(compare_function, thread_key, thread_value); - } -}; + template + ROCPRIM_DEVICE inline typename std::enable_if<(WarpSize > warp)>::type + swap(Key& k, int mask, int dir, BinaryFunction compare_function) + { + Key k1 = warp_shuffle_xor(k, mask, WarpSize); + if(compare_function(k, k1) == dir) + { + k = k1; + } + } + + template + ROCPRIM_DEVICE inline void bitonic_sort(BinaryFunction compare_function, KeyValue&... kv) + { + static_assert(sizeof...(KeyValue) < 3, + "KeyValue parameter pack can 1 or 2 elements (key, or key and value)"); + + unsigned int id = detail::logical_lane_id(); + swap<2, BinaryFunction>(kv..., 1, get_bit(id, 1) ^ get_bit(id, 0), compare_function); + + swap<4, BinaryFunction>(kv..., 2, get_bit(id, 2) ^ get_bit(id, 1), compare_function); + swap<4, BinaryFunction>(kv..., 1, get_bit(id, 2) ^ get_bit(id, 0), compare_function); + + swap<8, BinaryFunction>(kv..., 4, get_bit(id, 3) ^ get_bit(id, 2), compare_function); + swap<8, BinaryFunction>(kv..., 2, get_bit(id, 3) ^ get_bit(id, 1), compare_function); + swap<8, BinaryFunction>(kv..., 1, get_bit(id, 3) ^ get_bit(id, 0), compare_function); + + swap<16, BinaryFunction>(kv..., 8, get_bit(id, 4) ^ get_bit(id, 3), compare_function); + swap<16, BinaryFunction>(kv..., 4, get_bit(id, 4) ^ get_bit(id, 2), compare_function); + swap<16, BinaryFunction>(kv..., 2, get_bit(id, 4) ^ get_bit(id, 1), compare_function); + swap<16, BinaryFunction>(kv..., 1, get_bit(id, 4) ^ get_bit(id, 0), compare_function); + + swap<32, BinaryFunction>(kv..., 16, get_bit(id, 5) ^ get_bit(id, 4), compare_function); + swap<32, BinaryFunction>(kv..., 8, get_bit(id, 5) ^ get_bit(id, 3), compare_function); + swap<32, BinaryFunction>(kv..., 4, get_bit(id, 5) ^ get_bit(id, 2), compare_function); + swap<32, BinaryFunction>(kv..., 2, get_bit(id, 5) ^ get_bit(id, 1), compare_function); + swap<32, BinaryFunction>(kv..., 1, get_bit(id, 5) ^ get_bit(id, 0), compare_function); + + swap<32, BinaryFunction>(kv..., 32, get_bit(id, 5), compare_function); + swap<16, BinaryFunction>(kv..., 16, get_bit(id, 4), compare_function); + swap<8, BinaryFunction>(kv..., 8, get_bit(id, 3), compare_function); + swap<4, BinaryFunction>(kv..., 4, get_bit(id, 2), compare_function); + swap<2, BinaryFunction>(kv..., 2, get_bit(id, 1), compare_function); + swap<0, BinaryFunction>(kv..., 1, get_bit(id, 0), compare_function); + } + + public: + static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2"); + + using storage_type = ::rocprim::detail::empty_storage_type; + + template + ROCPRIM_DEVICE inline void sort(Key& thread_value, BinaryFunction compare_function) + { + // sort by value only + bitonic_sort(compare_function, thread_value); + } + + template + ROCPRIM_DEVICE inline void + sort(Key& thread_value, storage_type& storage, BinaryFunction compare_function) + { + (void)storage; + sort(thread_value, compare_function); + } + + template + ROCPRIM_DEVICE inline void + sort(Key& thread_key, Value& thread_value, BinaryFunction compare_function) + { + bitonic_sort(compare_function, thread_key, thread_value); + } + + template + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + storage_type& storage, + BinaryFunction compare_function) + { + (void)storage; + return sort(compare_function, thread_key, thread_value); + } + }; } // end namespace detail diff --git a/rocprim/include/rocprim/warp/warp_reduce.hpp b/rocprim/include/rocprim/warp/warp_reduce.hpp index a0d3828e7..698a4e2d5 100644 --- a/rocprim/include/rocprim/warp/warp_reduce.hpp +++ b/rocprim/include/rocprim/warp/warp_reduce.hpp @@ -26,8 +26,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" #include "detail/warp_reduce_crosslane.hpp" @@ -41,17 +41,17 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Select warp_reduce implementation based WarpSize -template -struct select_warp_reduce_impl -{ - typedef typename std::conditional< - // can we use crosslane (DPP or shuffle-based) implementation? - detail::is_warpsize_shuffleable::value, - detail::warp_reduce_crosslane, // yes - detail::warp_reduce_shared_mem // no - >::type type; -}; + // Select warp_reduce implementation based WarpSize + template + struct select_warp_reduce_impl + { + typedef typename std::conditional< + // can we use crosslane (DPP or shuffle-based) implementation? + detail::is_warpsize_shuffleable::value, + detail::warp_reduce_crosslane, // yes + detail::warp_reduce_shared_mem // no + >::type type; + }; } // end namespace detail @@ -106,11 +106,7 @@ struct select_warp_reduce_impl /// } /// \endcode /// \endparblock -template< - class T, - unsigned int WarpSize = warp_size(), - bool UseAllReduce = false -> +template class warp_reduce #ifndef DOXYGEN_SHOULD_SKIP_THIS : private detail::select_warp_reduce_impl::type @@ -176,12 +172,11 @@ class warp_reduce /// } /// \endcode /// \endparblock - template> - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - storage_type& storage, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + storage_type& storage, + BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, storage, reduce_op); } @@ -232,13 +227,12 @@ class warp_reduce /// } /// \endcode /// \endparblock - template> - ROCPRIM_DEVICE inline - void reduce(T input, - T& output, - int valid_items, - storage_type& storage, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void reduce(T input, + T& output, + int valid_items, + storage_type& storage, + BinaryFunction reduce_op = BinaryFunction()) { base_type::reduce(input, output, valid_items, storage, reduce_op); } @@ -261,13 +255,12 @@ class warp_reduce /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). - template> - ROCPRIM_DEVICE inline - void head_segmented_reduce(T input, - T& output, - Flag flag, - storage_type& storage, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void head_segmented_reduce(T input, + T& output, + Flag flag, + storage_type& storage, + BinaryFunction reduce_op = BinaryFunction()) { base_type::head_segmented_reduce(input, output, flag, storage, reduce_op); } @@ -290,13 +283,12 @@ class warp_reduce /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). - template> - ROCPRIM_DEVICE inline - void tail_segmented_reduce(T input, - T& output, - Flag flag, - storage_type& storage, - BinaryFunction reduce_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void tail_segmented_reduce(T input, + T& output, + Flag flag, + storage_type& storage, + BinaryFunction reduce_op = BinaryFunction()) { base_type::tail_segmented_reduce(input, output, flag, storage, reduce_op); } diff --git a/rocprim/include/rocprim/warp/warp_scan.hpp b/rocprim/include/rocprim/warp/warp_scan.hpp index b64e593ad..161302609 100644 --- a/rocprim/include/rocprim/warp/warp_scan.hpp +++ b/rocprim/include/rocprim/warp/warp_scan.hpp @@ -26,8 +26,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "../types.hpp" #include "detail/warp_scan_crosslane.hpp" @@ -41,17 +41,17 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Select warp_scan implementation based WarpSize -template -struct select_warp_scan_impl -{ - typedef typename std::conditional< - // can we use crosslane (DPP or shuffle-based) implementation? - detail::is_warpsize_shuffleable::value, - detail::warp_scan_crosslane, // yes - detail::warp_scan_shared_mem // no - >::type type; -}; + // Select warp_scan implementation based WarpSize + template + struct select_warp_scan_impl + { + typedef typename std::conditional< + // can we use crosslane (DPP or shuffle-based) implementation? + detail::is_warpsize_shuffleable::value, + detail::warp_scan_crosslane, // yes + detail::warp_scan_shared_mem // no + >::type type; + }; } // end namespace detail @@ -104,10 +104,7 @@ struct select_warp_scan_impl /// } /// \endcode /// \endparblock -template< - class T, - unsigned int WarpSize = warp_size() -> +template class warp_scan #ifndef DOXYGEN_SHOULD_SKIP_THIS : private detail::select_warp_scan_impl::type @@ -177,12 +174,11 @@ class warp_scan /// output values in the first logical warp will be {1, -2, -2, -4, ..., -32}, in the second: /// {33, -34, -34, -36, ..., -64} etc. /// \endparblock - template> - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T input, + T& output, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::inclusive_scan(input, output, storage, scan_op); } @@ -236,13 +232,12 @@ class warp_scan /// \p output values in the every logical warp will be {1, 2, 3, 4, ..., 64}. /// The \p reduction will be equal \p 64. /// \endparblock - template> - ROCPRIM_DEVICE inline - void inclusive_scan(T input, - T& output, - T& reduction, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void inclusive_scan(T input, + T& output, + T& reduction, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::inclusive_scan(input, output, reduction, storage, scan_op); } @@ -299,13 +294,12 @@ class warp_scan /// warp will be {100, 1, -2, -2, -4, ..., -30}, in the second: /// {100, 33, -34, -34, -36, ..., -62} etc. /// \endparblock - template> - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T input, + T& output, + T init, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::exclusive_scan(input, output, init, storage, scan_op); } @@ -363,14 +357,13 @@ class warp_scan /// {1, 1, ..., 1, 1}, then \p output values in every logical warp will be /// {10, 11, 12, 13, ..., 73}. The \p reduction will be 64. /// \endparblock - template> - ROCPRIM_DEVICE inline - void exclusive_scan(T input, - T& output, - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void exclusive_scan(T input, + T& output, + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::exclusive_scan(input, output, init, reduction, storage, scan_op); } @@ -433,14 +426,13 @@ class warp_scan /// logical warp will be {100, 1, -2, -2, -4, ..., -30}, in the second: /// {100, 33, -34, -34, -36, ..., -62} etc. /// \endparblock - template> - ROCPRIM_DEVICE inline - void scan(T input, - T& inclusive_output, - T& exclusive_output, - T init, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::scan(input, inclusive_output, exclusive_output, init, storage, scan_op); } @@ -503,20 +495,17 @@ class warp_scan /// {1, 2, 3, 4, ..., 63, 64}, and \p ex_output values in every logical warp will /// be {10, 11, 12, 13, ..., 73}. The \p reduction will be 64. /// \endparblock - template> - ROCPRIM_DEVICE inline - void scan(T input, - T& inclusive_output, - T& exclusive_output, - T init, - T& reduction, - storage_type& storage, - BinaryFunction scan_op = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void scan(T input, + T& inclusive_output, + T& exclusive_output, + T init, + T& reduction, + storage_type& storage, + BinaryFunction scan_op = BinaryFunction()) { base_type::scan( - input, inclusive_output, exclusive_output, init, reduction, - storage, scan_op - ); + input, inclusive_output, exclusive_output, init, reduction, storage, scan_op); } /// \brief Broadcasts value from one thread to all threads in logical warp. @@ -528,18 +517,15 @@ class warp_scan /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). - ROCPRIM_DEVICE inline - T broadcast(T input, - const unsigned int src_lane, - storage_type& storage) + ROCPRIM_DEVICE inline T broadcast(T input, const unsigned int src_lane, storage_type& storage) { return base_type::broadcast(input, src_lane, storage); } #ifndef DOXYGEN_SHOULD_SKIP_THIS protected: - ROCPRIM_DEVICE inline - void to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) + ROCPRIM_DEVICE inline void + to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) { return base_type::to_exclusive(inclusive_input, exclusive_output, storage); } diff --git a/rocprim/include/rocprim/warp/warp_sort.hpp b/rocprim/include/rocprim/warp/warp_sort.hpp index 94e16e6a2..43f3b9b5c 100644 --- a/rocprim/include/rocprim/warp/warp_sort.hpp +++ b/rocprim/include/rocprim/warp/warp_sort.hpp @@ -26,8 +26,8 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" #include "detail/warp_sort_shuffle.hpp" @@ -91,11 +91,7 @@ BEGIN_ROCPRIM_NAMESPACE /// } /// \endcode /// \endparblock -template< - class Key, - unsigned int WarpSize = warp_size(), - class Value = empty_type -> +template class warp_sort : detail::warp_sort_shuffle { typedef typename detail::warp_sort_shuffle base_type; @@ -124,10 +120,9 @@ class warp_sort : detail::warp_sort_shuffle /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + BinaryFunction compare_function = BinaryFunction()) { base_type::sort(thread_key, compare_function); } @@ -160,15 +155,12 @@ class warp_sort : detail::warp_sort_shuffle /// ... /// } /// \endcode - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - storage_type& storage, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + storage_type& storage, + BinaryFunction compare_function = BinaryFunction()) { - base_type::sort( - thread_key, storage, compare_function - ); + base_type::sort(thread_key, storage, compare_function); } /// \brief Warp sort by key for any data type. @@ -182,15 +174,12 @@ class warp_sort : detail::warp_sort_shuffle /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + BinaryFunction compare_function = BinaryFunction()) { - base_type::sort( - thread_key, thread_value, compare_function - ); + base_type::sort(thread_key, thread_value, compare_function); } /// \brief Warp sort by key for any data type using temporary storage. @@ -222,16 +211,13 @@ class warp_sort : detail::warp_sort_shuffle /// ... /// } /// \endcode - template> - ROCPRIM_DEVICE inline - void sort(Key& thread_key, - Value& thread_value, - storage_type& storage, - BinaryFunction compare_function = BinaryFunction()) + template > + ROCPRIM_DEVICE inline void sort(Key& thread_key, + Value& thread_value, + storage_type& storage, + BinaryFunction compare_function = BinaryFunction()) { - base_type::sort( - thread_key, thread_value, storage, compare_function - ); + base_type::sort(thread_key, thread_value, storage, compare_function); } }; diff --git a/test/extra/test_rocprim_package.cpp b/test/extra/test_rocprim_package.cpp index 0cea1fbfc..aaf54b003 100644 --- a/test/extra/test_rocprim_package.cpp +++ b/test/extra/test_rocprim_package.cpp @@ -18,43 +18,38 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. +#include #include #include -#include #include #include -#define HIP_CHECK(condition) \ - { \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << error << std::endl; \ - exit(error); \ - } \ - } +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << error << std::endl; \ + exit(error); \ + } \ + } int main(int, char**) { using T = unsigned int; // host input/output - const size_t size = 1024 * 256; + const size_t size = 1024 * 256; std::vector input(size, 1); - T output = 0; + T output = 0; // device input/output - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host @@ -62,14 +57,10 @@ int main(int, char**) // Temporary storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, input.size() - ) - ); + rocprim::reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, input.size())); // Allocate temporary storage HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); @@ -77,29 +68,17 @@ int main(int, char**) // Run HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, input.size() - ) - ); + rocprim::reduce(d_temp_storage, temp_storage_size_bytes, d_input, d_output, input.size())); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - &output, d_output, - sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&output, d_output, sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); if(output != expected) { - std::cout - << "Failure: output (" << output - << ") != expected (" << expected << ")" - << std::endl; + std::cout << "Failure: output (" << output << ") != expected (" << expected << ")" + << std::endl; return 1; } return 0; diff --git a/test/rocprim/bounds_checking_iterator.hpp b/test/rocprim/bounds_checking_iterator.hpp index 1d97a66f3..35f30ed77 100644 --- a/test/rocprim/bounds_checking_iterator.hpp +++ b/test/rocprim/bounds_checking_iterator.hpp @@ -21,8 +21,8 @@ #ifndef TEST_BOUNDS_CHECKING_ITERATOR_HPP_ #define TEST_BOUNDS_CHECKING_ITERATOR_HPP_ -#include #include +#include // rocPRIM #include @@ -30,164 +30,160 @@ namespace test_utils { -// Output iterator checking out of bounds situations -template -class bounds_checking_iterator -{ -public: - // Iterator traits - using difference_type = std::ptrdiff_t; - using value_type = void; - using pointer = void; - using reference = T&; - - using iterator_category = std::random_access_iterator_tag; - - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator(T * ptr, T * start_ptr, bool * out_of_bounds_flag, size_t size) - : ptr_(ptr), start_ptr_(start_ptr), out_of_bounds_flag_(out_of_bounds_flag), size_(size) - { } - - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator(T * ptr, bool * out_of_bounds_flag, size_t size) - : bounds_checking_iterator(ptr, ptr, out_of_bounds_flag, size) - { } - - ROCPRIM_HOST_DEVICE inline - ~bounds_checking_iterator() = default; - - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator& operator++() - { - ptr_++; - return *this; - } + // Output iterator checking out of bounds situations + template + class bounds_checking_iterator + { + public: + // Iterator traits + using difference_type = std::ptrdiff_t; + using value_type = void; + using pointer = void; + using reference = T&; + + using iterator_category = std::random_access_iterator_tag; + + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator(T* ptr, + T* start_ptr, + bool* out_of_bounds_flag, + size_t size) + : ptr_(ptr) + , start_ptr_(start_ptr) + , out_of_bounds_flag_(out_of_bounds_flag) + , size_(size) + { + } - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator operator++(int) - { - bounds_checking_iterator old = *this; - ptr_++; - return old; - } + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator(T* ptr, + bool* out_of_bounds_flag, + size_t size) + : bounds_checking_iterator(ptr, ptr, out_of_bounds_flag, size) + { + } - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator& operator--() - { - ptr_--; - return *this; - } + ROCPRIM_HOST_DEVICE inline ~bounds_checking_iterator() = default; - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator operator--(int) - { - bounds_checking_iterator old = *this; - ptr_--; - return old; - } + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator& operator++() + { + ptr_++; + return *this; + } - ROCPRIM_HOST_DEVICE inline - reference operator*() const - { - if((ptr_ < start_ptr_) || (ptr_ >= start_ptr_ + size_)) + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator operator++(int) { - *out_of_bounds_flag_ = true; + bounds_checking_iterator old = *this; + ptr_++; + return old; } - return *ptr_; - } - ROCPRIM_HOST_DEVICE inline - reference operator[](difference_type n) const - { - if(((ptr_ + n) < start_ptr_) || ((ptr_ + n) >= start_ptr_ + size_)) + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator& operator--() { - *out_of_bounds_flag_ = true; + ptr_--; + return *this; } - return *(ptr_ + n); - } - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator operator+(difference_type distance) const - { - auto i = ptr_ + distance; - return bounds_checking_iterator(i, start_ptr_, out_of_bounds_flag_, size_); - } + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator operator--(int) + { + bounds_checking_iterator old = *this; + ptr_--; + return old; + } - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator& operator+=(difference_type distance) - { - ptr_ += distance; - return *this; - } + ROCPRIM_HOST_DEVICE inline reference operator*() const + { + if((ptr_ < start_ptr_) || (ptr_ >= start_ptr_ + size_)) + { + *out_of_bounds_flag_ = true; + } + return *ptr_; + } - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator operator-(difference_type distance) const - { - auto i = ptr_ - distance; - return bounds_checking_iterator(i, start_ptr_, out_of_bounds_flag_, size_); - } + ROCPRIM_HOST_DEVICE inline reference operator[](difference_type n) const + { + if(((ptr_ + n) < start_ptr_) || ((ptr_ + n) >= start_ptr_ + size_)) + { + *out_of_bounds_flag_ = true; + } + return *(ptr_ + n); + } - ROCPRIM_HOST_DEVICE inline - bounds_checking_iterator& operator-=(difference_type distance) - { - ptr_ -= distance; - return *this; - } + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator + operator+(difference_type distance) const + { + auto i = ptr_ + distance; + return bounds_checking_iterator(i, start_ptr_, out_of_bounds_flag_, size_); + } - ROCPRIM_HOST_DEVICE inline - difference_type operator-(bounds_checking_iterator other) const - { - return ptr_ - other.ptr_; - } + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator& operator+=(difference_type distance) + { + ptr_ += distance; + return *this; + } - ROCPRIM_HOST_DEVICE inline - bool operator==(bounds_checking_iterator other) const - { - return ptr_ == other.ptr_; - } + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator + operator-(difference_type distance) const + { + auto i = ptr_ - distance; + return bounds_checking_iterator(i, start_ptr_, out_of_bounds_flag_, size_); + } - ROCPRIM_HOST_DEVICE inline - bool operator!=(bounds_checking_iterator other) const - { - return ptr_ != other.ptr_; - } + ROCPRIM_HOST_DEVICE inline bounds_checking_iterator& operator-=(difference_type distance) + { + ptr_ -= distance; + return *this; + } -private: - T * ptr_; - T * start_ptr_; - bool * out_of_bounds_flag_; - size_t size_; -}; + ROCPRIM_HOST_DEVICE inline difference_type operator-(bounds_checking_iterator other) const + { + return ptr_ - other.ptr_; + } -class out_of_bounds_flag -{ -public: - out_of_bounds_flag() - { - hipMalloc(&device_pointer_, sizeof(bool)); - hipMemset(device_pointer_, 0, sizeof(bool)); - } + ROCPRIM_HOST_DEVICE inline bool operator==(bounds_checking_iterator other) const + { + return ptr_ == other.ptr_; + } - ~out_of_bounds_flag() - { - hipFree(device_pointer_); - } + ROCPRIM_HOST_DEVICE inline bool operator!=(bounds_checking_iterator other) const + { + return ptr_ != other.ptr_; + } - bool get() const - { - bool value; - hipMemcpy(&value, device_pointer_, sizeof(bool), hipMemcpyDeviceToHost); - return value; - } + private: + T* ptr_; + T* start_ptr_; + bool* out_of_bounds_flag_; + size_t size_; + }; - bool * device_pointer() const + class out_of_bounds_flag { - return device_pointer_; - } + public: + out_of_bounds_flag() + { + hipMalloc(&device_pointer_, sizeof(bool)); + hipMemset(device_pointer_, 0, sizeof(bool)); + } + + ~out_of_bounds_flag() + { + hipFree(device_pointer_); + } + + bool get() const + { + bool value; + hipMemcpy(&value, device_pointer_, sizeof(bool), hipMemcpyDeviceToHost); + return value; + } + + bool* device_pointer() const + { + return device_pointer_; + } -private: - bool * device_pointer_; -}; + private: + bool* device_pointer_; + }; } // end test_utils namespace diff --git a/test/rocprim/detail/get_rocprim_version.cpp b/test/rocprim/detail/get_rocprim_version.cpp index abbf78875..20a1c0a5f 100644 --- a/test/rocprim/detail/get_rocprim_version.cpp +++ b/test/rocprim/detail/get_rocprim_version.cpp @@ -22,8 +22,7 @@ #include "get_rocprim_version.hpp" -__global__ -void get_version_kernel(unsigned int * version) +__global__ void get_version_kernel(unsigned int* version) { *version = rocprim::version(); } @@ -32,25 +31,15 @@ unsigned int get_rocprim_version_on_device() { unsigned int version = 0; - unsigned int * d_version; + unsigned int* d_version; HIP_CHECK(hipMalloc(&d_version, sizeof(unsigned int))); HIP_CHECK(hipDeviceSynchronize()); - hipLaunchKernelGGL( - get_version_kernel, - dim3(1), dim3(1), 0, 0, - d_version - ); + hipLaunchKernelGGL(get_version_kernel, dim3(1), dim3(1), 0, 0, d_version); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - HIP_CHECK( - hipMemcpy( - &version, d_version, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&version, d_version, sizeof(unsigned int), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(d_version)); diff --git a/test/rocprim/detail/get_rocprim_version.hpp b/test/rocprim/detail/get_rocprim_version.hpp index e507a2b2f..a9eb7572c 100644 --- a/test/rocprim/detail/get_rocprim_version.hpp +++ b/test/rocprim/detail/get_rocprim_version.hpp @@ -23,20 +23,21 @@ #ifndef ROCPRIM_TEST_DETAIL_GET_ROCPRIM_VERSION_HPP_ #define ROCPRIM_TEST_DETAIL_GET_ROCPRIM_VERSION_HPP_ -#include #include +#include // rocPRIM API #include -#define HIP_CHECK(condition) \ -{ \ - hipError_t error = condition; \ - if(error != hipSuccess){ \ - std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ - exit(error); \ - } \ -} +#define HIP_CHECK(condition) \ + { \ + hipError_t error = condition; \ + if(error != hipSuccess) \ + { \ + std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ + exit(error); \ + } \ + } unsigned int get_rocprim_version_on_device(); diff --git a/test/rocprim/identity_iterator.hpp b/test/rocprim/identity_iterator.hpp index 85b6136d4..1627c0f42 100644 --- a/test/rocprim/identity_iterator.hpp +++ b/test/rocprim/identity_iterator.hpp @@ -21,8 +21,8 @@ #ifndef TEST_IDENTITY_ITERATOR_HPP_ #define TEST_IDENTITY_ITERATOR_HPP_ -#include #include +#include // rocPRIM #include @@ -30,136 +30,119 @@ namespace test_utils { -// Output iterator used in tests to check situtations when -// value_type of output iterator is void -template -class identity_iterator -{ -public: - // Iterator traits - using difference_type = std::ptrdiff_t; - using value_type = void; - using pointer = void; - using reference = T&; - - using iterator_category = std::random_access_iterator_tag; - - ROCPRIM_HOST_DEVICE inline - identity_iterator(T * ptr) - : ptr_(ptr) - { } - - ROCPRIM_HOST_DEVICE inline - ~identity_iterator() = default; - - ROCPRIM_HOST_DEVICE inline - identity_iterator& operator++() - { - ptr_++; - return *this; - } - - ROCPRIM_HOST_DEVICE inline - identity_iterator operator++(int) - { - identity_iterator old = *this; - ptr_++; - return old; - } - - ROCPRIM_HOST_DEVICE inline - identity_iterator& operator--() - { - ptr_--; - return *this; - } - - ROCPRIM_HOST_DEVICE inline - identity_iterator operator--(int) - { - identity_iterator old = *this; - ptr_--; - return old; - } - - ROCPRIM_HOST_DEVICE inline - reference operator*() const - { - return *ptr_; - } - - ROCPRIM_HOST_DEVICE inline - reference operator[](difference_type n) const + // Output iterator used in tests to check situtations when + // value_type of output iterator is void + template + class identity_iterator { - return *(ptr_ + n); - } - - ROCPRIM_HOST_DEVICE inline - identity_iterator operator+(difference_type distance) const - { - auto i = ptr_ + distance; - return identity_iterator(i); - } - - ROCPRIM_HOST_DEVICE inline - identity_iterator& operator+=(difference_type distance) - { - ptr_ += distance; - return *this; - } - - ROCPRIM_HOST_DEVICE inline - identity_iterator operator-(difference_type distance) const - { - auto i = ptr_ - distance; - return identity_iterator(i); - } - - ROCPRIM_HOST_DEVICE inline - identity_iterator& operator-=(difference_type distance) + public: + // Iterator traits + using difference_type = std::ptrdiff_t; + using value_type = void; + using pointer = void; + using reference = T&; + + using iterator_category = std::random_access_iterator_tag; + + ROCPRIM_HOST_DEVICE inline identity_iterator(T* ptr) + : ptr_(ptr) + { + } + + ROCPRIM_HOST_DEVICE inline ~identity_iterator() = default; + + ROCPRIM_HOST_DEVICE inline identity_iterator& operator++() + { + ptr_++; + return *this; + } + + ROCPRIM_HOST_DEVICE inline identity_iterator operator++(int) + { + identity_iterator old = *this; + ptr_++; + return old; + } + + ROCPRIM_HOST_DEVICE inline identity_iterator& operator--() + { + ptr_--; + return *this; + } + + ROCPRIM_HOST_DEVICE inline identity_iterator operator--(int) + { + identity_iterator old = *this; + ptr_--; + return old; + } + + ROCPRIM_HOST_DEVICE inline reference operator*() const + { + return *ptr_; + } + + ROCPRIM_HOST_DEVICE inline reference operator[](difference_type n) const + { + return *(ptr_ + n); + } + + ROCPRIM_HOST_DEVICE inline identity_iterator operator+(difference_type distance) const + { + auto i = ptr_ + distance; + return identity_iterator(i); + } + + ROCPRIM_HOST_DEVICE inline identity_iterator& operator+=(difference_type distance) + { + ptr_ += distance; + return *this; + } + + ROCPRIM_HOST_DEVICE inline identity_iterator operator-(difference_type distance) const + { + auto i = ptr_ - distance; + return identity_iterator(i); + } + + ROCPRIM_HOST_DEVICE inline identity_iterator& operator-=(difference_type distance) + { + ptr_ -= distance; + return *this; + } + + ROCPRIM_HOST_DEVICE inline difference_type operator-(identity_iterator other) const + { + return ptr_ - other.ptr_; + } + + ROCPRIM_HOST_DEVICE inline bool operator==(identity_iterator other) const + { + return ptr_ == other.ptr_; + } + + ROCPRIM_HOST_DEVICE inline bool operator!=(identity_iterator other) const + { + return ptr_ != other.ptr_; + } + + private: + T* ptr_; + }; + + template + inline auto wrap_in_identity_iterator(T* ptr) -> + typename std::enable_if>::type { - ptr_ -= distance; - return *this; + return identity_iterator(ptr); } - ROCPRIM_HOST_DEVICE inline - difference_type operator-(identity_iterator other) const + template + inline auto wrap_in_identity_iterator(T* ptr) -> typename std::enable_if::type { - return ptr_ - other.ptr_; + return ptr; } - ROCPRIM_HOST_DEVICE inline - bool operator==(identity_iterator other) const - { - return ptr_ == other.ptr_; - } - - ROCPRIM_HOST_DEVICE inline - bool operator!=(identity_iterator other) const - { - return ptr_ != other.ptr_; - } - -private: - T* ptr_; -}; - -template -inline -auto wrap_in_identity_iterator(T* ptr) - -> typename std::enable_if>::type -{ - return identity_iterator(ptr); -} - -template -inline -auto wrap_in_identity_iterator(T* ptr) - -> typename std::enable_if::type -{ - return ptr; -} - } // end test_utils namespace #endif // TEST_IDENTITY_ITERATOR_HPP_ diff --git a/test/rocprim/test_arg_index_iterator.cpp b/test/rocprim/test_arg_index_iterator.cpp index 5514618c7..22f3a7ce8 100644 --- a/test/rocprim/test_arg_index_iterator.cpp +++ b/test/rocprim/test_arg_index_iterator.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include +#include #include +#include // Google Test #include @@ -34,36 +34,34 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) // Params for tests -template +template struct RocprimArgIndexIteratorParams { using input_type = InputType; }; -template +template class RocprimArgIndexIteratorTests : public ::testing::Test { public: - using input_type = typename Params::input_type; + using input_type = typename Params::input_type; const bool debug_synchronous = false; }; -typedef ::testing::Types< - RocprimArgIndexIteratorParams, - RocprimArgIndexIteratorParams, - RocprimArgIndexIteratorParams, - RocprimArgIndexIteratorParams -> RocprimArgIndexIteratorTestsParams; +typedef ::testing::Types, + RocprimArgIndexIteratorParams, + RocprimArgIndexIteratorParams, + RocprimArgIndexIteratorParams> + RocprimArgIndexIteratorTestsParams; TYPED_TEST_CASE(RocprimArgIndexIteratorTests, RocprimArgIndexIteratorTestsParams); TYPED_TEST(RocprimArgIndexIteratorTests, Equal) { - using T = typename TestFixture::input_type; + using T = typename TestFixture::input_type; using Iterator = typename rocprim::arg_index_iterator; std::vector input = test_utils::get_random_data(5, 1, 200); @@ -87,14 +85,10 @@ TYPED_TEST(RocprimArgIndexIteratorTests, Equal) struct arg_min { - template< - class Key, - class Value - > - ROCPRIM_HOST_DEVICE inline - constexpr rocprim::key_value_pair - operator()(const rocprim::key_value_pair& a, - const rocprim::key_value_pair& b) const + template + ROCPRIM_HOST_DEVICE inline constexpr rocprim::key_value_pair + operator()(const rocprim::key_value_pair& a, + const rocprim::key_value_pair& b) const { return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; } @@ -102,10 +96,10 @@ struct arg_min TYPED_TEST(RocprimArgIndexIteratorTests, ReduceArgMinimum) { - using T = typename TestFixture::input_type; - using Iterator = typename rocprim::arg_index_iterator; - using key_value = typename Iterator::value_type; - using difference_type = typename Iterator::difference_type; + using T = typename TestFixture::input_type; + using Iterator = typename rocprim::arg_index_iterator; + using key_value = typename Iterator::value_type; + using difference_type = typename Iterator::difference_type; const bool debug_synchronous = false; const size_t size = 1024; @@ -113,42 +107,38 @@ TYPED_TEST(RocprimArgIndexIteratorTests, ReduceArgMinimum) hipStream_t stream = 0; // default // Generate data - std::vector input = test_utils::get_random_data(size, 1, 200); + std::vector input = test_utils::get_random_data(size, 1, 200); std::vector output(1); - T * d_input; - key_value * d_output; + T* d_input; + key_value* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(key_value))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); Iterator d_iter(d_input); - arg_min reduce_op; + arg_min reduce_op; const key_value max(std::numeric_limits::max(), std::numeric_limits::max()); // Calculate expected results on host - Iterator x(input.data()); + Iterator x(input.data()); key_value expected = std::accumulate(x, x + size, max, reduce_op); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_iter, d_output, max, input.size(), - reduce_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_iter, + d_output, + max, + input.size(), + reduce_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -158,29 +148,27 @@ TYPED_TEST(RocprimArgIndexIteratorTests, ReduceArgMinimum) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_iter, d_output, max, input.size(), - reduce_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_iter, + d_output, + max, + input.size(), + reduce_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(key_value), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, output.size() * sizeof(key_value), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected auto diff = std::max(std::abs(0.01f * expected.value), T(0.01f)); - if(std::is_integral::value) diff = 0; + if(std::is_integral::value) + diff = 0; ASSERT_EQ(output[0].key, expected.key); ASSERT_NEAR(output[0].value, expected.value, diff); diff --git a/test/rocprim/test_block_discontinuity.cpp b/test/rocprim/test_block_discontinuity.cpp index 414599d5c..bb22072fa 100644 --- a/test/rocprim/test_block_discontinuity.cpp +++ b/test/rocprim/test_block_discontinuity.cpp @@ -24,8 +24,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -34,34 +34,28 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; -template< - class T, - class Flag, - unsigned int BlockSize, - unsigned int ItemsPerThread, - class FlagOp -> +template struct params { - using type = T; - using flag_type = Flag; - static constexpr unsigned int block_size = BlockSize; + using type = T; + using flag_type = Flag; + static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; - using flag_op_type = FlagOp; + using flag_op_type = FlagOp; }; -template -class RocprimBlockDiscontinuity : public ::testing::Test { +template +class RocprimBlockDiscontinuity : public ::testing::Test +{ public: using params = Params; }; -template +template struct custom_flag_op1 { ROCPRIM_HOST_DEVICE @@ -73,25 +67,24 @@ struct custom_flag_op1 struct custom_flag_op2 { - template - ROCPRIM_HOST_DEVICE - bool operator()(const T& a, const T& b) const + template + ROCPRIM_HOST_DEVICE bool operator()(const T& a, const T& b) const { return (a - b > 5); } }; // Host (CPU) implementaions of the wrapping function that allows to pass 3 args -template +template typename std::enable_if::value, bool>::type -apply(FlagOp flag_op, const T& a, const T& b, unsigned int b_index) + apply(FlagOp flag_op, const T& a, const T& b, unsigned int b_index) { return flag_op(a, b, b_index); } -template +template typename std::enable_if::value, bool>::type -apply(FlagOp flag_op, const T& a, const T& b, unsigned int) + apply(FlagOp flag_op, const T& a, const T& b, unsigned int) { return flag_op(a, b); } @@ -102,7 +95,9 @@ TEST(RocprimBlockDiscontinuity, Traits) ASSERT_FALSE((rp::detail::with_b_index_arg::value)); ASSERT_TRUE((rp::detail::with_b_index_arg>::value)); - auto f1 = [](const int& a, const int& b, unsigned int b_index) { return (a == b) || (b_index % 10 == 0); }; + auto f1 = [](const int& a, const int& b, unsigned int b_index) { + return (a == b) || (b_index % 10 == 0); + }; auto f2 = [](const int& a, const int& b) { return (a == b); }; ASSERT_TRUE((rp::detail::with_b_index_arg::value)); ASSERT_FALSE((rp::detail::with_b_index_arg::value)); @@ -113,55 +108,52 @@ TEST(RocprimBlockDiscontinuity, Traits) ASSERT_FALSE((rp::detail::with_b_index_arg::value)); } -using custom_int2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< // Power of 2 BlockSize - params >, - params >, - params >, - params >, - params >, + params>, + params>, + params>, + params>, + params>, params, // Non-power of 2 BlockSize - params >, - params >, - params >, - params >, - params >, + params>, + params>, + params>, + params>, + params>, // Power of 2 BlockSize and ItemsPerThread > 1 params, - params >, + params>, params, - params >, + params>, params, // Non-power of 2 BlockSize and ItemsPerThread > 1 params, - params >, - params >, - params >, - params -> Params; + params>, + params>, + params>, + params> + Params; TYPED_TEST_CASE(RocprimBlockDiscontinuity, Params); -template< - class Type, - class FlagType, - class FlagOpType, - unsigned int BlockSize, - unsigned int ItemsPerThread -> -__global__ -void flag_heads_kernel(Type* device_input, long long* device_heads) +template +__global__ void flag_heads_kernel(Type* device_input, long long* device_heads) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; Type input[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -188,17 +180,16 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeads) // std::vector is a special case that will cause an error in hipMemcpy // http://en.cppreference.com/w/cpp/container/vector_bool using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type - >::type; - using flag_type = typename TestFixture::params::flag_type; - using flag_op_type = typename TestFixture::params::flag_op_type; - constexpr size_t block_size = TestFixture::params::block_size; + std::is_same::value, + int, + typename TestFixture::params::flag_type>::type; + using flag_type = typename TestFixture::params::flag_type; + using flag_op_type = typename TestFixture::params::flag_op_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 2048; - constexpr size_t grid_size = size / items_per_block; + constexpr size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 2048; + constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -207,12 +198,12 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeads) } // Generate data - std::vector input = test_utils::get_random_data(size, 0, 10); + std::vector input = test_utils::get_random_data(size, 0, 10); std::vector heads(size); // Calculate expected results on host std::vector expected_heads(size); - flag_op_type flag_op; + flag_op_type flag_op; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t ii = 0; ii < items_per_block; ii++) @@ -220,9 +211,8 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeads) const size_t i = bi * items_per_block + ii; if(ii == 0) { - expected_heads[i] = bi % 2 == 1 - ? apply(flag_op, input[i - 1], input[i], ii) - : flag_type(true); + expected_heads[i] + = bi % 2 == 1 ? apply(flag_op, input[i - 1], input[i], ii) : flag_type(true); } else { @@ -233,40 +223,33 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeads) // Preparing Device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(hipMalloc(&device_heads, heads.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK( + hipMalloc(&device_heads, heads.size() * sizeof(typename decltype(heads)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - flag_heads_kernel< - type, flag_type, flag_op_type, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_heads - ); + flag_heads_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_heads); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - heads.data(), device_heads, - heads.size() * sizeof(typename decltype(heads)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(heads.data(), + device_heads, + heads.size() * sizeof(typename decltype(heads)::value_type), + hipMemcpyDeviceToHost)); // Validating results for(size_t i = 0; i < size; i++) @@ -278,19 +261,16 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeads) HIP_CHECK(hipFree(device_heads)); } -template< - class Type, - class FlagType, - class FlagOpType, - unsigned int BlockSize, - unsigned int ItemsPerThread -> -__global__ -void flag_tails_kernel(Type* device_input, long long* device_tails) +template +__global__ void flag_tails_kernel(Type* device_input, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; Type input[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -317,17 +297,16 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagTails) // std::vector is a special case that will cause an error in hipMemcpy // http://en.cppreference.com/w/cpp/container/vector_bool using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type - >::type; - using flag_type = typename TestFixture::params::flag_type; - using flag_op_type = typename TestFixture::params::flag_op_type; - constexpr size_t block_size = TestFixture::params::block_size; + std::is_same::value, + int, + typename TestFixture::params::flag_type>::type; + using flag_type = typename TestFixture::params::flag_type; + using flag_op_type = typename TestFixture::params::flag_op_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 2048; - constexpr size_t grid_size = size / items_per_block; + constexpr size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 2048; + constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -336,12 +315,12 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagTails) } // Generate data - std::vector input = test_utils::get_random_data(size, 0, 10); + std::vector input = test_utils::get_random_data(size, 0, 10); std::vector tails(size); // Calculate expected results on host std::vector expected_tails(size); - flag_op_type flag_op; + flag_op_type flag_op; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t ii = 0; ii < items_per_block; ii++) @@ -349,9 +328,8 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagTails) const size_t i = bi * items_per_block + ii; if(ii == items_per_block - 1) { - expected_tails[i] = bi % 2 == 0 - ? apply(flag_op, input[i], input[i + 1], ii + 1) - : flag_type(true); + expected_tails[i] = bi % 2 == 0 ? apply(flag_op, input[i], input[i + 1], ii + 1) + : flag_type(true); } else { @@ -362,40 +340,33 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagTails) // Preparing Device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); long long* device_tails; - HIP_CHECK(hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK( + hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - flag_tails_kernel< - type, flag_type, flag_op_type, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_tails - ); + flag_tails_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_tails); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - tails.data(), device_tails, - tails.size() * sizeof(typename decltype(tails)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(tails.data(), + device_tails, + tails.size() * sizeof(typename decltype(tails)::value_type), + hipMemcpyDeviceToHost)); // Validating results for(size_t i = 0; i < size; i++) @@ -407,19 +378,18 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagTails) HIP_CHECK(hipFree(device_tails)); } -template< - class Type, - class FlagType, - class FlagOpType, - unsigned int BlockSize, - unsigned int ItemsPerThread -> -__global__ -void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, long long* device_tails) +template +__global__ void flag_heads_and_tails_kernel(Type* device_input, + long long* device_heads, + long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = hipThreadIdx_x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; Type input[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -431,18 +401,25 @@ void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, lo if(hipBlockIdx_x % 4 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; - bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, FlagOpType()); + bdiscontinuity.flag_heads_and_tails( + head_flags, tail_flags, tile_successor_item, input, FlagOpType()); } else if(hipBlockIdx_x % 4 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; - const Type tile_successor_item = device_input[block_offset + items_per_block]; - bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input, FlagOpType()); + const Type tile_successor_item = device_input[block_offset + items_per_block]; + bdiscontinuity.flag_heads_and_tails(head_flags, + tile_predecessor_item, + tail_flags, + tile_successor_item, + input, + FlagOpType()); } else if(hipBlockIdx_x % 4 == 2) { const Type tile_predecessor_item = device_input[block_offset - 1]; - bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, FlagOpType()); + bdiscontinuity.flag_heads_and_tails( + head_flags, tile_predecessor_item, tail_flags, input, FlagOpType()); } else if(hipBlockIdx_x % 4 == 3) { @@ -459,17 +436,16 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeadsAndTails) // std::vector is a special case that will cause an error in hipMemcpy // http://en.cppreference.com/w/cpp/container/vector_bool using stored_flag_type = typename std::conditional< - std::is_same::value, - int, - typename TestFixture::params::flag_type - >::type; - using flag_type = typename TestFixture::params::flag_type; - using flag_op_type = typename TestFixture::params::flag_op_type; - constexpr size_t block_size = TestFixture::params::block_size; + std::is_same::value, + int, + typename TestFixture::params::flag_type>::type; + using flag_type = typename TestFixture::params::flag_type; + using flag_op_type = typename TestFixture::params::flag_op_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 2048; - constexpr size_t grid_size = size / items_per_block; + constexpr size_t items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 2048; + constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -478,14 +454,14 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeadsAndTails) } // Generate data - std::vector input = test_utils::get_random_data(size, 0, 10); + std::vector input = test_utils::get_random_data(size, 0, 10); std::vector heads(size); std::vector tails(size); // Calculate expected results on host std::vector expected_heads(size); std::vector expected_tails(size); - flag_op_type flag_op; + flag_op_type flag_op; for(size_t bi = 0; bi < size / items_per_block; bi++) { for(size_t ii = 0; ii < items_per_block; ii++) @@ -494,8 +470,8 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeadsAndTails) if(ii == 0) { expected_heads[i] = (bi % 4 == 1 || bi % 4 == 2) - ? apply(flag_op, input[i - 1], input[i], ii) - : flag_type(true); + ? apply(flag_op, input[i - 1], input[i], ii) + : flag_type(true); } else { @@ -504,8 +480,8 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeadsAndTails) if(ii == items_per_block - 1) { expected_tails[i] = (bi % 4 == 0 || bi % 4 == 1) - ? apply(flag_op, input[i], input[i + 1], ii + 1) - : flag_type(true); + ? apply(flag_op, input[i], input[i + 1], ii + 1) + : flag_type(true); } else { @@ -516,50 +492,44 @@ TYPED_TEST(RocprimBlockDiscontinuity, FlagHeadsAndTails) // Preparing Device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(hipMalloc(&device_heads, tails.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK( + hipMalloc(&device_heads, tails.size() * sizeof(typename decltype(heads)::value_type))); long long* device_tails; - HIP_CHECK(hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK( + hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - flag_heads_and_tails_kernel< - type, flag_type, flag_op_type, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_heads, device_tails - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(flag_heads_and_tails_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_heads, + device_tails); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - heads.data(), device_heads, - heads.size() * sizeof(typename decltype(heads)::value_type), - hipMemcpyDeviceToHost - ) - ); - - HIP_CHECK( - hipMemcpy( - tails.data(), device_tails, - tails.size() * sizeof(typename decltype(tails)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(heads.data(), + device_heads, + heads.size() * sizeof(typename decltype(heads)::value_type), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(tails.data(), + device_tails, + tails.size() * sizeof(typename decltype(tails)::value_type), + hipMemcpyDeviceToHost)); // Validating results for(size_t i = 0; i < size; i++) diff --git a/test/rocprim/test_block_exchange.cpp b/test/rocprim/test_block_exchange.cpp index 6f28f4820..2940c9559 100644 --- a/test/rocprim/test_block_exchange.cpp +++ b/test/rocprim/test_block_exchange.cpp @@ -22,9 +22,9 @@ #include #include -#include #include #include +#include // Google Test #include @@ -33,33 +33,28 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; -template< - class T, - class U, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template struct params { - using type = T; - using output_type = U; - static constexpr unsigned int block_size = BlockSize; + using type = T; + using output_type = U; + static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; }; -template -class RocprimBlockExchangeTests : public ::testing::Test { +template +class RocprimBlockExchangeTests : public ::testing::Test +{ public: using params = Params; }; -using custom_short2 = test_utils::custom_test_type; -using custom_int2 = test_utils::custom_test_type; +using custom_short2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< @@ -88,25 +83,19 @@ typedef ::testing::Types< params, params, params, - params -> Params; + params> + Params; TYPED_TEST_CASE(RocprimBlockExchangeTests, Params); -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> -__global__ -void blocked_to_striped_kernel(Type* device_input, OutputType* device_output) +template +__global__ void blocked_to_striped_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -118,11 +107,11 @@ void blocked_to_striped_kernel(Type* device_input, OutputType* device_output) TYPED_TEST(RocprimBlockExchangeTests, BlockedToStriped) { - using type = typename TestFixture::params::type; - using output_type = typename TestFixture::params::output_type; - constexpr size_t block_size = TestFixture::params::block_size; + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -131,7 +120,7 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToStriped) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, output_type(0)); @@ -145,46 +134,44 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToStriped) for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block; - const size_t i0 = offset + ti * items_per_thread + ii; - const size_t i1 = offset + ii * block_size + ti; - input[i1] = values[i1]; - expected[i0] = values[i1]; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + ii * block_size + ti; + input[i1] = values[i1]; + expected[i0] = values[i1]; } } } // Preparing device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(blocked_to_striped_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); + HIP_KERNEL_NAME( + blocked_to_striped_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -192,20 +179,14 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToStriped) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> -__global__ -void striped_to_blocked_kernel(Type* device_input, OutputType* device_output) +template +__global__ void striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -217,11 +198,11 @@ void striped_to_blocked_kernel(Type* device_input, OutputType* device_output) TYPED_TEST(RocprimBlockExchangeTests, StripedToBlocked) { - using type = typename TestFixture::params::type; - using output_type = typename TestFixture::params::output_type; - constexpr size_t block_size = TestFixture::params::block_size; + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -230,7 +211,7 @@ TYPED_TEST(RocprimBlockExchangeTests, StripedToBlocked) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, output_type(0)); @@ -244,46 +225,44 @@ TYPED_TEST(RocprimBlockExchangeTests, StripedToBlocked) for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block; - const size_t i0 = offset + ti * items_per_thread + ii; - const size_t i1 = offset + ii * block_size + ti; - input[i0] = values[i1]; - expected[i1] = values[i1]; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + ii * block_size + ti; + input[i0] = values[i1]; + expected[i1] = values[i1]; } } } // Preparing device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(striped_to_blocked_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); + HIP_KERNEL_NAME( + striped_to_blocked_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -291,20 +270,14 @@ TYPED_TEST(RocprimBlockExchangeTests, StripedToBlocked) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> -__global__ -void blocked_to_warp_striped_kernel(Type* device_input, OutputType* device_output) +template +__global__ void blocked_to_warp_striped_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -316,11 +289,11 @@ void blocked_to_warp_striped_kernel(Type* device_input, OutputType* device_outpu TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) { - using type = typename TestFixture::params::type; - using output_type = typename TestFixture::params::output_type; - constexpr size_t block_size = TestFixture::params::block_size; + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -329,13 +302,13 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, output_type(0)); - constexpr size_t warp_size = - ::rocprim::detail::get_min_warp_size(block_size, size_t(::rocprim::warp_size())); - constexpr size_t warps_no = (block_size + warp_size - 1) / warp_size; + constexpr size_t warp_size + = ::rocprim::detail::get_min_warp_size(block_size, size_t(::rocprim::warp_size())); + constexpr size_t warps_no = (block_size + warp_size - 1) / warp_size; constexpr size_t items_per_warp = warp_size * items_per_thread; // Calculate input and expected results on host @@ -345,18 +318,19 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) { for(size_t wi = 0; wi < warps_no; wi++) { - const size_t current_warp_size = wi == warps_no - 1 - ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) - : warp_size; + const size_t current_warp_size + = wi == warps_no - 1 + ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) + : warp_size; for(size_t li = 0; li < current_warp_size; li++) { for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block + wi * items_per_warp; - const size_t i0 = offset + li * items_per_thread + ii; - const size_t i1 = offset + ii * current_warp_size + li; - input[i1] = values[i1]; - expected[i0] = values[i1]; + const size_t i0 = offset + li * items_per_thread + ii; + const size_t i1 = offset + ii * current_warp_size + li; + input[i1] = values[i1]; + expected[i0] = values[i1]; } } } @@ -364,38 +338,34 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) // Preparing device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(blocked_to_warp_striped_kernel< - type, output_type, items_per_block, items_per_thread - >), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); + HIP_KERNEL_NAME( + blocked_to_warp_striped_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -403,20 +373,14 @@ TYPED_TEST(RocprimBlockExchangeTests, BlockedToWarpStriped) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> -__global__ -void warp_striped_to_blocked_kernel(Type* device_input, OutputType* device_output) +template +__global__ void warp_striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; + Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -428,11 +392,11 @@ void warp_striped_to_blocked_kernel(Type* device_input, OutputType* device_outpu TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) { - using type = typename TestFixture::params::type; - using output_type = typename TestFixture::params::output_type; - constexpr size_t block_size = TestFixture::params::block_size; + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -441,13 +405,13 @@ TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); + std::vector input(size); std::vector expected(size); std::vector output(size, output_type(0)); - constexpr size_t warp_size = - ::rocprim::detail::get_min_warp_size(block_size, size_t(::rocprim::warp_size())); - constexpr size_t warps_no = (block_size + warp_size - 1) / warp_size; + constexpr size_t warp_size + = ::rocprim::detail::get_min_warp_size(block_size, size_t(::rocprim::warp_size())); + constexpr size_t warps_no = (block_size + warp_size - 1) / warp_size; constexpr size_t items_per_warp = warp_size * items_per_thread; // Calculate input and expected results on host @@ -457,18 +421,19 @@ TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) { for(size_t wi = 0; wi < warps_no; wi++) { - const size_t current_warp_size = wi == warps_no - 1 - ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) - : warp_size; + const size_t current_warp_size + = wi == warps_no - 1 + ? (block_size % warp_size != 0 ? block_size % warp_size : warp_size) + : warp_size; for(size_t li = 0; li < current_warp_size; li++) { for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block + wi * items_per_warp; - const size_t i0 = offset + li * items_per_thread + ii; - const size_t i1 = offset + ii * current_warp_size + li; - input[i0] = values[i1]; - expected[i1] = values[i1]; + const size_t i0 = offset + li * items_per_thread + ii; + const size_t i1 = offset + ii * current_warp_size + li; + input[i0] = values[i1]; + expected[i1] = values[i1]; } } } @@ -476,36 +441,34 @@ TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) // Preparing device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_striped_to_blocked_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); + HIP_KERNEL_NAME( + warp_striped_to_blocked_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -513,21 +476,17 @@ TYPED_TEST(RocprimBlockExchangeTests, WarpStripedToBlocked) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> -__global__ -void scatter_to_blocked_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) +template +__global__ void scatter_to_blocked_kernel(Type* device_input, + OutputType* device_output, + unsigned int* device_ranks) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; - OutputType output[ItemsPerThread]; + Type input[ItemsPerThread]; + OutputType output[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); rp::block_load_direct_blocked(lid, device_ranks + block_offset, ranks); @@ -540,11 +499,11 @@ void scatter_to_blocked_kernel(Type* device_input, OutputType* device_output, un TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) { - using type = typename TestFixture::params::type; - using output_type = typename TestFixture::params::output_type; - constexpr size_t block_size = TestFixture::params::block_size; + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -553,9 +512,9 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); - std::vector expected(size); - std::vector output(size, output_type(0)); + std::vector input(size); + std::vector expected(size); + std::vector output(size, output_type(0)); std::vector ranks(size); // Calculate input and expected results on host @@ -563,7 +522,8 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); - std::shuffle(block_ranks, block_ranks + items_per_block, std::mt19937{std::random_device{}()}); + std::shuffle( + block_ranks, block_ranks + items_per_block, std::mt19937 {std::random_device {}()}); } std::vector values(size); std::iota(values.begin(), values.end(), 0); @@ -574,56 +534,51 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block; - const size_t i0 = offset + ti * items_per_thread + ii; - const size_t i1 = offset + ranks[i0]; - input[i0] = values[i0]; - expected[i1] = values[i0]; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 = offset + ranks[i0]; + input[i0] = values[i0]; + expected[i1] = values[i0]; } } } // Preparing device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); unsigned int* device_ranks; - HIP_CHECK(hipMalloc(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type))); HIP_CHECK( - hipMemcpy( - device_ranks, ranks.data(), - ranks.size() * sizeof(unsigned int), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy( + device_ranks, ranks.data(), ranks.size() * sizeof(unsigned int), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(scatter_to_blocked_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, device_ranks - ); + HIP_KERNEL_NAME( + scatter_to_blocked_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_ranks); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -632,21 +587,17 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToBlocked) HIP_CHECK(hipFree(device_ranks)); } -template< - class Type, - class OutputType, - unsigned int ItemsPerBlock, - unsigned int ItemsPerThread -> -__global__ -void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) +template +__global__ void scatter_to_striped_kernel(Type* device_input, + OutputType* device_output, + unsigned int* device_ranks) { - constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; - Type input[ItemsPerThread]; - OutputType output[ItemsPerThread]; + Type input[ItemsPerThread]; + OutputType output[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_input + block_offset, input); rp::block_load_direct_blocked(lid, device_ranks + block_offset, ranks); @@ -659,11 +610,11 @@ void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, un TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) { - using type = typename TestFixture::params::type; - using output_type = typename TestFixture::params::output_type; - constexpr size_t block_size = TestFixture::params::block_size; + using type = typename TestFixture::params::type; + using output_type = typename TestFixture::params::output_type; + constexpr size_t block_size = TestFixture::params::block_size; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr size_t items_per_block = block_size * items_per_thread; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -672,9 +623,9 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) const size_t size = items_per_block * 113; // Generate data - std::vector input(size); - std::vector expected(size); - std::vector output(size, output_type(0)); + std::vector input(size); + std::vector expected(size); + std::vector output(size, output_type(0)); std::vector ranks(size); // Calculate input and expected results on host @@ -682,7 +633,8 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); - std::shuffle(block_ranks, block_ranks + items_per_block, std::mt19937{std::random_device{}()}); + std::shuffle( + block_ranks, block_ranks + items_per_block, std::mt19937 {std::random_device {}()}); } std::vector values(size); std::iota(values.begin(), values.end(), 0); @@ -693,11 +645,10 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) for(size_t ii = 0; ii < items_per_thread; ii++) { const size_t offset = bi * items_per_block; - const size_t i0 = offset + ti * items_per_thread + ii; - const size_t i1 = offset - + ranks[i0] % block_size * items_per_thread - + ranks[i0] / block_size; - input[i0] = values[i0]; + const size_t i0 = offset + ti * items_per_thread + ii; + const size_t i1 + = offset + ranks[i0] % block_size * items_per_thread + ranks[i0] / block_size; + input[i0] = values[i0]; expected[i1] = values[i0]; } } @@ -705,46 +656,41 @@ TYPED_TEST(RocprimBlockExchangeTests, ScatterToStriped) // Preparing device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); output_type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); unsigned int* device_ranks; - HIP_CHECK(hipMalloc(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(type), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_ranks, ranks.size() * sizeof(typename decltype(ranks)::value_type))); HIP_CHECK( - hipMemcpy( - device_ranks, ranks.data(), - ranks.size() * sizeof(unsigned int), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(type), hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy( + device_ranks, ranks.data(), ranks.size() * sizeof(unsigned int), hipMemcpyHostToDevice)); // Running kernel constexpr unsigned int grid_size = (size / items_per_block); hipLaunchKernelGGL( - HIP_KERNEL_NAME(scatter_to_striped_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, device_ranks - ); + HIP_KERNEL_NAME( + scatter_to_striped_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_ranks); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Reading results - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); diff --git a/test/rocprim/test_block_histogram.cpp b/test/rocprim/test_block_histogram.cpp index d73f8afe7..80f95a2d9 100644 --- a/test/rocprim/test_block_histogram.cpp +++ b/test/rocprim/test_block_histogram.cpp @@ -35,93 +35,134 @@ namespace rp = rocprim; // Params for tests -template< - class T, - class BinType, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U, - unsigned int BinSize = BlockSize, - rocprim::block_histogram_algorithm Algorithm = rocprim::block_histogram_algorithm::using_atomic -> +template struct params { - using type = T; - using bin_type = BinType; - static constexpr rocprim::block_histogram_algorithm algorithm = Algorithm; - static constexpr unsigned int block_size = BlockSize; - static constexpr unsigned int items_per_thread = ItemsPerThread; - static constexpr unsigned int bin_size = BinSize; + using type = T; + using bin_type = BinType; + static constexpr rocprim::block_histogram_algorithm algorithm = Algorithm; + static constexpr unsigned int block_size = BlockSize; + static constexpr unsigned int items_per_thread = ItemsPerThread; + static constexpr unsigned int bin_size = BinSize; }; -template +template class RocprimBlockHistogramInputArrayTests : public ::testing::Test { public: - using type = typename Params::type; - using bin_type = typename Params::bin_type; - static constexpr unsigned int block_size = Params::block_size; - static constexpr rocprim::block_histogram_algorithm algorithm = Params::algorithm; - static constexpr unsigned int items_per_thread = Params::items_per_thread; - static constexpr unsigned int bin_size = Params::bin_size; + using type = typename Params::type; + using bin_type = typename Params::bin_type; + static constexpr unsigned int block_size = Params::block_size; + static constexpr rocprim::block_histogram_algorithm algorithm = Params::algorithm; + static constexpr unsigned int items_per_thread = Params::items_per_thread; + static constexpr unsigned int bin_size = Params::bin_size; }; typedef ::testing::Types< // ----------------------------------------------------------------------- // rocprim::block_histogram_algorithm::using_atomic // ----------------------------------------------------------------------- - params, - params, - params, - params, + params, + params, + params, + params, params, - params, - params, - params, - params, - params, - params, - params, - params, + params, + params, + params, + params, + params, + params, + params, + params, params, // ----------------------------------------------------------------------- // rocprim::block_histogram_algorithm::using_sort // ----------------------------------------------------------------------- - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params -> InputArrayTestParams; + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params> + InputArrayTestParams; TYPED_TEST_CASE(RocprimBlockHistogramInputArrayTests, InputArrayTestParams); -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - unsigned int BinSize, - rocprim::block_histogram_algorithm Algorithm, - class T, - class BinType -> -__global__ -void histogram_kernel(T* device_output, T* device_output_bin) +template +__global__ void histogram_kernel(T* device_output, T* device_output_bin) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; + unsigned int global_offset = hipBlockIdx_x * BinSize; __shared__ BinType hist[BinSize]; // load T in_out[ItemsPerThread]; @@ -133,8 +174,8 @@ void histogram_kernel(T* device_output, T* device_output_bin) rp::block_histogram bhist; bhist.histogram(in_out, hist); - #pragma unroll - for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) +#pragma unroll + for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + hipThreadIdx_x < BinSize) { @@ -146,12 +187,12 @@ void histogram_kernel(T* device_output, T* device_output_bin) TYPED_TEST(RocprimBlockHistogramInputArrayTests, Histogram) { - using T = typename TestFixture::type; - using BinType = typename TestFixture::bin_type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + using BinType = typename TestFixture::bin_type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; - constexpr size_t bin = TestFixture::bin_size; + constexpr size_t bin = TestFixture::bin_size; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -160,9 +201,9 @@ TYPED_TEST(RocprimBlockHistogramInputArrayTests, Histogram) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t bin_sizes = bin * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t bin_sizes = bin * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 0, bin - 1); @@ -176,7 +217,7 @@ TYPED_TEST(RocprimBlockHistogramInputArrayTests, Histogram) for(size_t j = 0; j < items_per_block; j++) { auto bin_idx = i * bin; - auto idx = i * items_per_block + j; + auto idx = i * items_per_block + j; expected_bin[bin_idx + static_cast(output[idx])]++; } } @@ -188,42 +229,32 @@ TYPED_TEST(RocprimBlockHistogramInputArrayTests, Histogram) HIP_CHECK(hipMalloc(&device_output_bin, output_bin.size() * sizeof(T))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK( - hipMemcpy( - device_output_bin, output_bin.data(), - output_bin.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_output_bin, + output_bin.data(), + output_bin.size() * sizeof(T), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(histogram_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_bin - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bin); // Reading results back - HIP_CHECK( - hipMemcpy( - output_bin.data(), device_output_bin, - output_bin.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_bin.data(), + device_output_bin, + output_bin.size() * sizeof(T), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < output_bin.size(); i++) { - ASSERT_EQ( - output_bin[i], expected_bin[i] - ); + ASSERT_EQ(output_bin[i], expected_bin[i]); } HIP_CHECK(hipFree(device_output)); diff --git a/test/rocprim/test_block_load_store.cpp b/test/rocprim/test_block_load_store.cpp index 4a16d1171..b604d1d31 100644 --- a/test/rocprim/test_block_load_store.cpp +++ b/test/rocprim/test_block_load_store.cpp @@ -21,8 +21,8 @@ // SOFTWARE. #include -#include #include +#include // Google Test #include @@ -34,231 +34,366 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; -template< - class T, - class U, - unsigned int ItemsPerThread, - bool ShouldBeVectorized -> +template struct params { - using type = T; - using vector_type = U; - static constexpr unsigned int items_per_thread = ItemsPerThread; - static constexpr bool should_be_vectorized = ShouldBeVectorized; + using type = T; + using vector_type = U; + static constexpr unsigned int items_per_thread = ItemsPerThread; + static constexpr bool should_be_vectorized = ShouldBeVectorized; }; -template< - class Type, - rp::block_load_method Load, - rp::block_store_method Store, - unsigned int BlockSize, - unsigned int ItemsPerThread -> +template struct class_params { - using type = Type; - static constexpr rp::block_load_method load_method = Load; - static constexpr rp::block_store_method store_method = Store; - static constexpr unsigned int block_size = BlockSize; - static constexpr unsigned int items_per_thread = ItemsPerThread; + using type = Type; + static constexpr rp::block_load_method load_method = Load; + static constexpr rp::block_store_method store_method = Store; + static constexpr unsigned int block_size = BlockSize; + static constexpr unsigned int items_per_thread = ItemsPerThread; }; -template -class RocprimBlockLoadStoreClassTests : public ::testing::Test { +template +class RocprimBlockLoadStoreClassTests : public ::testing::Test +{ public: using params = ClassParams; }; -template -class RocprimVectorizationTests : public ::testing::Test { +template +class RocprimVectorizationTests : public ::testing::Test +{ public: using params = Params; }; typedef ::testing::Types< // block_load_direct - class_params, - class_params, - class_params, - class_params, - class_params, - class_params, - - class_params, - class_params, - class_params, - class_params, - class_params, - class_params, - - class_params, rp::block_load_method::block_load_direct, - rp::block_store_method::block_store_direct, 64U, 1>, - class_params, rp::block_load_method::block_load_direct, - rp::block_store_method::block_store_direct, 64U, 5>, - class_params, rp::block_load_method::block_load_direct, - rp::block_store_method::block_store_direct, 256U, 1>, - class_params, rp::block_load_method::block_load_direct, - rp::block_store_method::block_store_direct, 256U, 4>, + class_params, + class_params, + class_params, + class_params, + class_params, + class_params, + + class_params, + class_params, + class_params, + class_params, + class_params, + class_params, + + class_params, + rp::block_load_method::block_load_direct, + rp::block_store_method::block_store_direct, + 64U, + 1>, + class_params, + rp::block_load_method::block_load_direct, + rp::block_store_method::block_store_direct, + 64U, + 5>, + class_params, + rp::block_load_method::block_load_direct, + rp::block_store_method::block_store_direct, + 256U, + 1>, + class_params, + rp::block_load_method::block_load_direct, + rp::block_store_method::block_store_direct, + 256U, + 4>, // block_load_vectorize - class_params, - class_params, - class_params, - class_params, - class_params, - class_params, - - class_params, - class_params, - class_params, - class_params, - class_params, - class_params, - - class_params, rp::block_load_method::block_load_vectorize, - rp::block_store_method::block_store_vectorize, 64U, 1>, - class_params, rp::block_load_method::block_load_vectorize, - rp::block_store_method::block_store_vectorize, 64U, 4>, - class_params, rp::block_load_method::block_load_vectorize, - rp::block_store_method::block_store_vectorize, 256U, 1>, - class_params, rp::block_load_method::block_load_vectorize, - rp::block_store_method::block_store_vectorize, 256U, 4>, + class_params, + class_params, + class_params, + class_params, + class_params, + class_params, + + class_params, + class_params, + class_params, + class_params, + class_params, + class_params, + + class_params, + rp::block_load_method::block_load_vectorize, + rp::block_store_method::block_store_vectorize, + 64U, + 1>, + class_params, + rp::block_load_method::block_load_vectorize, + rp::block_store_method::block_store_vectorize, + 64U, + 4>, + class_params, + rp::block_load_method::block_load_vectorize, + rp::block_store_method::block_store_vectorize, + 256U, + 1>, + class_params, + rp::block_load_method::block_load_vectorize, + rp::block_store_method::block_store_vectorize, + 256U, + 4>, // block_load_transpose - class_params, - class_params, - class_params, - class_params, - class_params, - class_params, - - class_params, - class_params, - class_params, - class_params, - class_params, - class_params, - - class_params, rp::block_load_method::block_load_transpose, - rp::block_store_method::block_store_transpose, 64U, 1>, - class_params, rp::block_load_method::block_load_transpose, - rp::block_store_method::block_store_transpose, 64U, 5>, - class_params, rp::block_load_method::block_load_transpose, - rp::block_store_method::block_store_transpose, 256U, 1>, - class_params, rp::block_load_method::block_load_transpose, - rp::block_store_method::block_store_transpose, 256U, 4> - -> ClassParams; - -typedef ::testing::Types< - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params -> Params; + class_params, + class_params, + class_params, + class_params, + class_params, + class_params, + + class_params, + class_params, + class_params, + class_params, + class_params, + class_params, + + class_params, + rp::block_load_method::block_load_transpose, + rp::block_store_method::block_store_transpose, + 64U, + 1>, + class_params, + rp::block_load_method::block_load_transpose, + rp::block_store_method::block_store_transpose, + 64U, + 5>, + class_params, + rp::block_load_method::block_load_transpose, + rp::block_store_method::block_store_transpose, + 256U, + 1>, + class_params, + rp::block_load_method::block_load_transpose, + rp::block_store_method::block_store_transpose, + 256U, + 4> + + > + ClassParams; + +typedef ::testing::Types, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params> + Params; TYPED_TEST_CASE(RocprimBlockLoadStoreClassTests, ClassParams); TYPED_TEST_CASE(RocprimVectorizationTests, Params); -template< - class Type, - rp::block_load_method LoadMethod, - rp::block_store_method StoreMethod, - unsigned int BlockSize, - unsigned int ItemsPerThread -> -__global__ -void load_store_kernel(Type* device_input, Type* device_output) +template +__global__ void load_store_kernel(Type* device_input, Type* device_output) { - Type items[ItemsPerThread]; + Type items[ItemsPerThread]; unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; - rp::block_load load; + rp::block_load load; rp::block_store store; load.load(device_input + offset, items); store.store(device_output + offset, items); @@ -266,14 +401,14 @@ void load_store_kernel(Type* device_input, Type* device_output) TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) { - using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rp::block_load_method load_method = TestFixture::params::load_method; - constexpr rp::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 113; - const auto grid_size = size / items_per_block; + using Type = typename TestFixture::params::type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr rp::block_load_method load_method = TestFixture::params::load_method; + constexpr rp::block_store_method store_method = TestFixture::params::store_method; + const size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) { @@ -286,10 +421,10 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) // Calculate expected results on host std::vector expected(input.size(), 0); - for (size_t i = 0; i < 113; i++) + for(size_t i = 0; i < 113; i++) { size_t block_offset = i * items_per_block; - for (size_t j = 0; j < items_per_block; j++) + for(size_t j = 0; j < items_per_block; j++) { expected[j + block_offset] = input[j + block_offset]; } @@ -297,38 +432,33 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) // Preparing device Type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); Type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(typename decltype(input)::value_type), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(device_input, + input.data(), + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - load_store_kernel< - Type, load_method, store_method, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); + load_store_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); // Reading results from device - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -337,19 +467,16 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - rp::block_load_method LoadMethod, - rp::block_store_method StoreMethod, - unsigned int BlockSize, - unsigned int ItemsPerThread -> -__global__ -void load_store_valid_kernel(Type* device_input, Type* device_output, size_t valid) +template +__global__ void load_store_valid_kernel(Type* device_input, Type* device_output, size_t valid) { - Type items[ItemsPerThread]; + Type items[ItemsPerThread]; unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; - rp::block_load load; + rp::block_load load; rp::block_store store; load.load(device_input + offset, items, valid); store.store(device_output + offset, items, valid); @@ -357,14 +484,14 @@ void load_store_valid_kernel(Type* device_input, Type* device_output, size_t val TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) { - using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rp::block_load_method load_method = TestFixture::params::load_method; - constexpr rp::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 113; - const auto grid_size = size / items_per_block; + using Type = typename TestFixture::params::type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr rp::block_load_method load_method = TestFixture::params::load_method; + constexpr rp::block_store_method store_method = TestFixture::params::store_method; + const size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) { @@ -378,12 +505,12 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) // Calculate expected results on host std::vector expected(input.size(), 0); - for (size_t i = 0; i < 113; i++) + for(size_t i = 0; i < 113; i++) { size_t block_offset = i * items_per_block; - for (size_t j = 0; j < items_per_block; j++) + for(size_t j = 0; j < items_per_block; j++) { - if (j < valid) + if(j < valid) { expected[j + block_offset] = input[j + block_offset]; } @@ -392,47 +519,40 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) // Preparing device Type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); Type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(typename decltype(input)::value_type), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(device_input, + input.data(), + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyHostToDevice)); // Have to initialize output for unvalid data to make sure they are not changed - HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_output, + output.data(), + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - load_store_valid_kernel< - Type, load_method, store_method, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, valid - ); + load_store_valid_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + valid); // Reading results from device - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -441,19 +561,19 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) HIP_CHECK(hipFree(device_output)); } -template< - class Type, - rp::block_load_method LoadMethod, - rp::block_store_method StoreMethod, - unsigned int BlockSize, - unsigned int ItemsPerThread -> -__global__ -void load_store_valid_default_kernel(Type* device_input, Type* device_output, size_t valid, int _default) +template +__global__ void load_store_valid_default_kernel(Type* device_input, + Type* device_output, + size_t valid, + int _default) { - Type items[ItemsPerThread]; + Type items[ItemsPerThread]; unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; - rp::block_load load; + rp::block_load load; rp::block_store store; load.load(device_input + offset, items, valid, _default); store.store(device_output + offset, items); @@ -461,34 +581,34 @@ void load_store_valid_default_kernel(Type* device_input, Type* device_output, si TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) { - using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rp::block_load_method load_method = TestFixture::params::load_method; - constexpr rp::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 113; - const auto grid_size = size / items_per_block; + using Type = typename TestFixture::params::type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr rp::block_load_method load_method = TestFixture::params::load_method; + constexpr rp::block_store_method store_method = TestFixture::params::store_method; + const size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr auto items_per_block = block_size * items_per_thread; + const size_t size = items_per_block * 113; + const auto grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) { return; } - const size_t valid = items_per_thread + 1; - int _default = -1; + const size_t valid = items_per_thread + 1; + int _default = -1; // Generate data std::vector input = test_utils::get_random_data(size, -100, 100); std::vector output(input.size(), 0); // Calculate expected results on host std::vector expected(input.size(), _default); - for (size_t i = 0; i < 113; i++) + for(size_t i = 0; i < 113; i++) { size_t block_offset = i * items_per_block; - for (size_t j = 0; j < items_per_block; j++) + for(size_t j = 0; j < items_per_block; j++) { - if (j < valid) + if(j < valid) { expected[j + block_offset] = input[j + block_offset]; } @@ -497,38 +617,37 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) // Preparing device Type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); Type* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(typename decltype(input)::value_type), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(device_input, + input.data(), + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - load_store_valid_default_kernel< - Type, load_method, store_method, - block_size, items_per_thread - > - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, valid, _default - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(load_store_valid_default_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + valid, + _default); // Reading results from device - HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + device_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -539,17 +658,17 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) TYPED_TEST(RocprimVectorizationTests, IsVectorizable) { - using T = typename TestFixture::params::type; - constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr bool should_be_vectorized = TestFixture::params::should_be_vectorized; - bool input = rp::detail::is_vectorizable(); + using T = typename TestFixture::params::type; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr bool should_be_vectorized = TestFixture::params::should_be_vectorized; + bool input = rp::detail::is_vectorizable(); ASSERT_EQ(input, should_be_vectorized); } TYPED_TEST(RocprimVectorizationTests, MatchVectorType) { - using T = typename TestFixture::params::type; - using U = typename TestFixture::params::vector_type; + using T = typename TestFixture::params::type; + using U = typename TestFixture::params::vector_type; constexpr size_t items_per_thread = TestFixture::params::items_per_thread; typedef typename rp::detail::match_vector_type::type Vector; bool input = std::is_same::value; diff --git a/test/rocprim/test_block_radix_sort.cpp b/test/rocprim/test_block_radix_sort.cpp index 808b6dd9a..7a3fc4d93 100644 --- a/test/rocprim/test_block_radix_sort.cpp +++ b/test/rocprim/test_block_radix_sort.cpp @@ -24,8 +24,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -38,35 +38,34 @@ namespace rp = rocprim; -template< - class Key, - class Value, - unsigned int BlockSize, - unsigned int ItemsPerThread, - bool Descending = false, - bool ToStriped = false, - unsigned int StartBit = 0, - unsigned int EndBit = sizeof(Key) * 8 -> +template struct params { - using key_type = Key; - using value_type = Value; - static constexpr unsigned int block_size = BlockSize; + using key_type = Key; + using value_type = Value; + static constexpr unsigned int block_size = BlockSize; static constexpr unsigned int items_per_thread = ItemsPerThread; - static constexpr bool descending = Descending; - static constexpr bool to_striped = ToStriped; - static constexpr unsigned int start_bit = StartBit; - static constexpr unsigned int end_bit = EndBit; + static constexpr bool descending = Descending; + static constexpr bool to_striped = ToStriped; + static constexpr unsigned int start_bit = StartBit; + static constexpr unsigned int end_bit = EndBit; }; -template -class RocprimBlockRadixSort : public ::testing::Test { +template +class RocprimBlockRadixSort : public ::testing::Test +{ public: using params = Params; }; -using custom_int2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< @@ -104,26 +103,27 @@ typedef ::testing::Types< // Stability (a number of key values is lower than BlockSize * ItemsPerThread: some keys appear // multiple times with different values or key parts outside [StartBit, EndBit)) params, - params -> Params; + params> + Params; TYPED_TEST_CASE(RocprimBlockRadixSort, Params); -template +template struct key_comparator { - static_assert(rp::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); + static_assert(rp::is_unsigned::value, + "Test supports start and end bits only for unsigned integers"); bool operator()(const Key& lhs, const Key& rhs) { auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = (static_cast(rhs) >> StartBit) & mask; + auto l = (static_cast(lhs) >> StartBit) & mask; + auto r = (static_cast(rhs) >> StartBit) & mask; return Descending ? (r < l) : (l < r); } }; -template +template struct key_comparator { bool operator()(const Key& lhs, const Key& rhs) @@ -132,7 +132,7 @@ struct key_comparator } }; -template +template struct key_comparator { bool operator()(const rp::half& lhs, const rp::half& rhs) @@ -142,7 +142,7 @@ struct key_comparator } }; -template +template struct key_value_comparator { bool operator()(const std::pair& lhs, const std::pair& rhs) @@ -151,22 +151,16 @@ struct key_value_comparator } }; -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type -> -__global__ -void sort_key_kernel( - key_type* device_keys_output, - bool to_striped, - bool descending, - unsigned int start_bit, - unsigned int end_bit) +template +__global__ void sort_key_kernel(key_type* device_keys_output, + bool to_striped, + bool descending, + unsigned int start_bit, + unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; key_type keys[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_keys_output + block_offset, keys); @@ -195,75 +189,70 @@ void sort_key_kernel( TYPED_TEST(RocprimBlockRadixSort, SortKeys) { - using key_type = typename TestFixture::params::key_type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr bool descending = TestFixture::params::descending; - constexpr bool to_striped = TestFixture::params::to_striped; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr size_t items_per_block = block_size * items_per_thread; + using key_type = typename TestFixture::params::key_type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr bool descending = TestFixture::params::descending; + constexpr bool to_striped = TestFixture::params::to_striped; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { return; } - const size_t size = items_per_block * 1134; + const size_t size = items_per_block * 1134; const size_t grid_size = size / items_per_block; // Generate data std::vector keys_output; if(rp::is_floating_point::value) { - keys_output = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_output + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_output = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } // Calculate expected results on host std::vector expected(keys_output); for(size_t i = 0; i < size / items_per_block; i++) { - std::stable_sort( - expected.begin() + (i * items_per_block), - expected.begin() + ((i + 1) * items_per_block), - key_comparator() - ); + std::stable_sort(expected.begin() + (i * items_per_block), + expected.begin() + ((i + 1) * items_per_block), + key_comparator()); } // Preparing device key_type* device_keys_output; HIP_CHECK(hipMalloc(&device_keys_output, keys_output.size() * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - device_keys_output, keys_output.data(), - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_keys_output, + keys_output.data(), + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_key_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_keys_output, to_striped, descending, start_bit, end_bit - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_key_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_output, + to_striped, + descending, + start_bit, + end_bit); // Getting results to host - HIP_CHECK( - hipMemcpy( - keys_output.data(), device_keys_output, - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + device_keys_output, + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyDeviceToHost)); // Verifying results ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(keys_output, expected)); @@ -271,26 +260,19 @@ TYPED_TEST(RocprimBlockRadixSort, SortKeys) HIP_CHECK(hipFree(device_keys_output)); } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - class key_type, - class value_type -> -__global__ -void sort_key_value_kernel( - key_type* device_keys_output, - value_type* device_values_output, - bool to_striped, - bool descending, - unsigned int start_bit, - unsigned int end_bit) +template +__global__ void sort_key_value_kernel(key_type* device_keys_output, + value_type* device_values_output, + bool to_striped, + bool descending, + unsigned int start_bit, + unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = hipThreadIdx_x; + const unsigned int block_offset = hipBlockIdx_x * items_per_block; - key_type keys[ItemsPerThread]; + key_type keys[ItemsPerThread]; value_type values[ItemsPerThread]; rp::block_load_direct_blocked(lid, device_keys_output + block_offset, keys); rp::block_load_direct_blocked(lid, device_values_output + block_offset, values); @@ -318,39 +300,36 @@ void sort_key_value_kernel( } } - TYPED_TEST(RocprimBlockRadixSort, SortKeysValues) { - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr bool descending = TestFixture::params::descending; - constexpr bool to_striped = TestFixture::params::to_striped; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr size_t items_per_block = block_size * items_per_thread; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr size_t block_size = TestFixture::params::block_size; + constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + constexpr bool descending = TestFixture::params::descending; + constexpr bool to_striped = TestFixture::params::to_striped; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { return; } - const size_t size = items_per_block * 1134; + const size_t size = items_per_block * 1134; const size_t grid_size = size / items_per_block; // Generate data std::vector keys_output; if(rp::is_floating_point::value) { - keys_output = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_output + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_output = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_output = test_utils::get_random_data(size, 0, 100); @@ -369,15 +348,14 @@ TYPED_TEST(RocprimBlockRadixSort, SortKeysValues) std::stable_sort( expected.begin() + (i * items_per_block), expected.begin() + ((i + 1) * items_per_block), - key_value_comparator() - ); + key_value_comparator()); } - std::vector keys_expected(size); + std::vector keys_expected(size); std::vector values_expected(size); for(size_t i = 0; i < size; i++) { - keys_expected[i] = expected[i].first; + keys_expected[i] = expected[i].first; values_expected[i] = expected[i].second; } @@ -386,45 +364,40 @@ TYPED_TEST(RocprimBlockRadixSort, SortKeysValues) value_type* device_values_output; HIP_CHECK(hipMalloc(&device_values_output, values_output.size() * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - device_keys_output, keys_output.data(), - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyHostToDevice - ) - ); - - HIP_CHECK( - hipMemcpy( - device_values_output, values_output.data(), - values_output.size() * sizeof(typename decltype(values_output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_keys_output, + keys_output.data(), + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_values_output, + values_output.data(), + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_key_value_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_keys_output, device_values_output, to_striped, descending, start_bit, end_bit - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_keys_output, + device_values_output, + to_striped, + descending, + start_bit, + end_bit); // Getting results to host - HIP_CHECK( - hipMemcpy( - keys_output.data(), device_keys_output, - keys_output.size() * sizeof(typename decltype(keys_output)::value_type), - hipMemcpyDeviceToHost - ) - ); - - HIP_CHECK( - hipMemcpy( - values_output.data(), device_values_output, - values_output.size() * sizeof(typename decltype(values_output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + device_keys_output, + keys_output.size() * sizeof(typename decltype(keys_output)::value_type), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(values_output.data(), + device_values_output, + values_output.size() * sizeof(typename decltype(values_output)::value_type), + hipMemcpyDeviceToHost)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(keys_output, keys_expected)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(values_output, values_expected)); @@ -432,4 +405,3 @@ TYPED_TEST(RocprimBlockRadixSort, SortKeysValues) HIP_CHECK(hipFree(device_keys_output)); HIP_CHECK(hipFree(device_values_output)); } - diff --git a/test/rocprim/test_block_reduce.cpp b/test/rocprim/test_block_reduce.cpp index 01aca3435..1cbbc3994 100644 --- a/test/rocprim/test_block_reduce.cpp +++ b/test/rocprim/test_block_reduce.cpp @@ -34,41 +34,39 @@ namespace rp = rocprim; -template +template T apply(BinaryOp binary_op, const T& a, const T& b) { return binary_op(a, b); } // Params for tests -template< - class T, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U, - rp::block_reduce_algorithm Algorithm = rp::block_reduce_algorithm::using_warp_reduce, - class BinaryOp = rocprim::plus -> +template > struct params { - using type = T; - using binary_op_type = BinaryOp; - static constexpr rp::block_reduce_algorithm algorithm = Algorithm; - static constexpr unsigned int block_size = BlockSize; - static constexpr unsigned int items_per_thread = ItemsPerThread; + using type = T; + using binary_op_type = BinaryOp; + static constexpr rp::block_reduce_algorithm algorithm = Algorithm; + static constexpr unsigned int block_size = BlockSize; + static constexpr unsigned int items_per_thread = ItemsPerThread; }; // --------------------------------------------------------- // Test for reduce ops taking single input value // --------------------------------------------------------- -template +template class RocprimBlockReduceSingleValueTests : public ::testing::Test { public: - using type = typename Params::type; - using binary_op_type = typename Params::binary_op_type; - static constexpr rp::block_reduce_algorithm algorithm = Params::algorithm; - static constexpr unsigned int block_size = Params::block_size; + using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; + static constexpr rp::block_reduce_algorithm algorithm = Params::algorithm; + static constexpr unsigned int block_size = Params::block_size; }; typedef ::testing::Types< @@ -95,9 +93,21 @@ typedef ::testing::Types< params, params, // half tests - params, - params, - params, + params, + params, + params, // long tests params, params, @@ -124,22 +134,19 @@ typedef ::testing::Types< params, params, params, - params -> SingleValueTestParams; + params> + SingleValueTestParams; TYPED_TEST_CASE(RocprimBlockReduceSingleValueTests, SingleValueTestParams); -template< - unsigned int BlockSize, - rocprim::block_reduce_algorithm Algorithm, - class T, - class BinaryOp -> -__global__ -void reduce_kernel(T* device_output, T* device_output_reductions) +template +__global__ void reduce_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; rp::block_reduce breduce; breduce.reduce(value, value, BinaryOp()); if(hipThreadIdx_x == 0) @@ -150,9 +157,9 @@ void reduce_kernel(T* device_output, T* device_output_reductions) TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) { - using T = typename TestFixture::type; - using binary_op_type = typename TestFixture::binary_op_type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Given block size not supported @@ -161,7 +168,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 50); @@ -176,7 +183,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; - value = apply(binary_op, value, output[idx]); + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -188,28 +195,22 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) HIP_CHECK(hipMalloc(&device_output_reductions, output_reductions.size() * sizeof(T))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); // Reading results back - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Verifying results test_utils::assert_eq(output_reductions, expected_reductions); @@ -218,16 +219,11 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) HIP_CHECK(hipFree(device_output_reductions)); } -template< - unsigned int BlockSize, - rocprim::block_reduce_algorithm Algorithm, - class T -> -__global__ -void reduce_multiplies_kernel(T* device_output, T* device_output_reductions) +template +__global__ void reduce_multiplies_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; rp::block_reduce breduce; breduce.reduce(value, value, rocprim::multiplies()); if(hipThreadIdx_x == 0) @@ -236,7 +232,7 @@ void reduce_multiplies_kernel(T* device_output, T* device_output_reductions) } } -template +template T host_multiplies(const T& x, const T& y) { return x * y; @@ -250,8 +246,8 @@ rp::half host_multiplies(const rp::half&, const rp::half&) TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Half not tested here @@ -266,11 +262,11 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output(size, 1); - auto two_places = test_utils::get_random_data(size/32, 0, size-1); + auto two_places = test_utils::get_random_data(size / 32, 0, size - 1); for(auto i : two_places) { output[i] = T(2); @@ -285,7 +281,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; - value = host_multiplies(value, output[idx]); + value = host_multiplies(value, output[idx]); } expected_reductions[i] = value; } @@ -297,28 +293,22 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) HIP_CHECK(hipMalloc(&device_output_reductions, output_reductions.size() * sizeof(T))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_multiplies_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_multiplies_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); // Reading results back - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Verifying results test_utils::assert_eq(output_reductions, expected_reductions); @@ -329,17 +319,16 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) TYPED_TEST_CASE(RocprimBlockReduceSingleValueTests, SingleValueTestParams); -template< - unsigned int BlockSize, - rocprim::block_reduce_algorithm Algorithm, - class T, - class BinaryOp -> -__global__ -void reduce_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items) +template +__global__ void reduce_valid_kernel(T* device_output, + T* device_output_reductions, + const unsigned int valid_items) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; rp::block_reduce breduce; breduce.reduce(value, value, valid_items, BinaryOp()); if(hipThreadIdx_x == 0) @@ -350,10 +339,10 @@ void reduce_valid_kernel(T* device_output, T* device_output_reductions, const un TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) { - using T = typename TestFixture::type; - using binary_op_type = typename TestFixture::binary_op_type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; const unsigned int valid_items = test_utils::get_random_value(block_size - 10, block_size); // Given block size not supported @@ -362,7 +351,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 50); @@ -377,7 +366,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) for(size_t j = 0; j < valid_items; j++) { auto idx = i * block_size + j; - value = apply(binary_op, value, output[idx]); + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -389,28 +378,24 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) HIP_CHECK(hipMalloc(&device_output_reductions, output_reductions.size() * sizeof(T))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(reduce_valid_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions, valid_items - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions, + valid_items); // Reading results back - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Verifying results test_utils::assert_eq(output_reductions, expected_reductions); @@ -419,68 +404,76 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) HIP_CHECK(hipFree(device_output_reductions)); } - -template +template class RocprimBlockReduceInputArrayTests : public ::testing::Test { public: - using type = typename Params::type; - using binary_op_type = typename Params::binary_op_type; - static constexpr unsigned int block_size = Params::block_size; - static constexpr rocprim::block_reduce_algorithm algorithm = Params::algorithm; - static constexpr unsigned int items_per_thread = Params::items_per_thread; + using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; + static constexpr unsigned int block_size = Params::block_size; + static constexpr rocprim::block_reduce_algorithm algorithm = Params::algorithm; + static constexpr unsigned int items_per_thread = Params::items_per_thread; }; typedef ::testing::Types< // ----------------------------------------------------------------------- // rocprim::block_reduce_algorithm::using_warp_reduce // ----------------------------------------------------------------------- - params, - params, - params, - params, + params, + params, + params, + params, params, - params, - params, - params, - params, + params, + params, + params, + params, params, - params, - params, - params, - params, - params, + params, + params, + params, + params, + params, // ----------------------------------------------------------------------- // rocprim::block_reduce_algorithm::raking_reduce // ----------------------------------------------------------------------- - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params -> InputArrayTestParams; + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params> + InputArrayTestParams; TYPED_TEST_CASE(RocprimBlockReduceInputArrayTests, InputArrayTestParams); -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_reduce_algorithm Algorithm, - class T, - class BinaryOp -> -__global__ -void reduce_array_kernel(T* device_output, T* device_output_reductions) +template +__global__ void reduce_array_kernel(T* device_output, T* device_output_reductions) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; // load @@ -491,7 +484,7 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) } rp::block_reduce breduce; - T reduction; + T reduction; breduce.reduce(in_out, reduction, BinaryOp()); if(hipThreadIdx_x == 0) @@ -502,10 +495,10 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) { - using T = typename TestFixture::type; - using binary_op_type = typename TestFixture::binary_op_type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + using binary_op_type = typename TestFixture::binary_op_type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -515,8 +508,8 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 2, 50); @@ -532,7 +525,7 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; - value = apply(binary_op, value, output[idx]); + value = apply(binary_op, value, output[idx]); } expected_reductions[i] = value; } @@ -544,36 +537,29 @@ TYPED_TEST(RocprimBlockReduceInputArrayTests, Reduce) HIP_CHECK(hipMalloc(&device_output_reductions, output_reductions.size() * sizeof(T))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK( - hipMemcpy( - device_output_reductions, output_reductions.data(), - output_reductions.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_output_reductions, + output_reductions.data(), + output_reductions.size() * sizeof(T), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(reduce_array_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions - ); + HIP_KERNEL_NAME( + reduce_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); // Reading results back - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Verifying results test_utils::assert_near(output_reductions, expected_reductions, 0.05); diff --git a/test/rocprim/test_block_scan.cpp b/test/rocprim/test_block_scan.cpp index b7c514f28..d24d1f962 100644 --- a/test/rocprim/test_block_scan.cpp +++ b/test/rocprim/test_block_scan.cpp @@ -35,34 +35,32 @@ namespace rp = rocprim; // Params for tests -template< - class T, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U, - rocprim::block_scan_algorithm Algorithm = rocprim::block_scan_algorithm::using_warp_scan, - class BinaryOp = rocprim::plus -> +template > struct params { - using type = T; - using binary_op_type = BinaryOp; - static constexpr rocprim::block_scan_algorithm algorithm = Algorithm; - static constexpr unsigned int block_size = BlockSize; - static constexpr unsigned int items_per_thread = ItemsPerThread; + using type = T; + using binary_op_type = BinaryOp; + static constexpr rocprim::block_scan_algorithm algorithm = Algorithm; + static constexpr unsigned int block_size = BlockSize; + static constexpr unsigned int items_per_thread = ItemsPerThread; }; // --------------------------------------------------------- // Test for scan ops taking single input value // --------------------------------------------------------- -template +template class RocprimBlockScanSingleValueTests : public ::testing::Test { public: - using type = typename Params::type; - using binary_op_type = typename Params::binary_op_type; - static constexpr rocprim::block_scan_algorithm algorithm = Params::algorithm; - static constexpr unsigned int block_size = Params::block_size; + using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; + static constexpr rocprim::block_scan_algorithm algorithm = Params::algorithm; + static constexpr unsigned int block_size = Params::block_size; }; typedef ::testing::Types< @@ -107,22 +105,23 @@ typedef ::testing::Types< params, params, params, - params, 140, 1, rocprim::block_scan_algorithm::reduce_then_scan>, - params, 201U, 1, rocprim::block_scan_algorithm::reduce_then_scan> -> SingleValueTestParams; + params, + 140, + 1, + rocprim::block_scan_algorithm::reduce_then_scan>, + params, + 201U, + 1, + rocprim::block_scan_algorithm::reduce_then_scan>> + SingleValueTestParams; TYPED_TEST_CASE(RocprimBlockScanSingleValueTests, SingleValueTestParams); -template< - unsigned int BlockSize, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void inclusive_scan_kernel(T* device_output) +template +__global__ void inclusive_scan_kernel(T* device_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; rp::block_scan bscan; bscan.inclusive_scan(value, value); device_output[index] = value; @@ -130,8 +129,8 @@ void inclusive_scan_kernel(T* device_output) TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScan) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Given block size not supported @@ -140,7 +139,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScan) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); @@ -151,41 +150,33 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScan) { for(size_t j = 0; j < block_size; j++) { - auto idx = i * block_size + j; - expected[idx] = output[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * block_size + j; + expected[idx] = output[idx] + expected[j > 0 ? idx - 1 : idx]; } } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(inclusive_scan_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); @@ -193,17 +184,12 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScan) HIP_CHECK(hipFree(device_output)); } -template< - unsigned int BlockSize, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions) +template +__global__ void inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; - T reduction; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + T reduction; rp::block_scan bscan; bscan.inclusive_scan(value, value, reduction); device_output[index] = value; @@ -215,8 +201,8 @@ void inclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions) TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanReduce) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Given block size not supported @@ -225,7 +211,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanReduce) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); @@ -238,57 +224,44 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanReduce) { for(size_t j = 0; j < block_size; j++) { - auto idx = i * block_size + j; - expected[idx] = output[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * block_size + j; + expected[idx] = output[idx] + expected[j > 0 ? idx - 1 : idx]; } - expected_reductions[i] = expected[(i+1) * block_size - 1]; + expected_reductions[i] = expected[(i + 1) * block_size - 1]; } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_reductions; HIP_CHECK( - hipMalloc( - &device_output_reductions, - output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(hipMalloc(&device_output_reductions, + output_reductions.size() + * sizeof(typename decltype(output_reductions)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(inclusive_scan_reduce_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); @@ -298,19 +271,14 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanReduce) HIP_CHECK(hipFree(device_output_reductions)); } -template< - unsigned int BlockSize, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void inclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) +template +__global__ void + inclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T prefix_value = block_prefix; - auto prefix_callback = [&prefix_value](T reduction) - { - T prefix = prefix_value; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) { + T prefix = prefix_value; prefix_value = prefix_value + reduction; return prefix; }; @@ -330,8 +298,8 @@ void inclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanPrefixCallback) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Given block size not supported @@ -340,12 +308,12 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanPrefixCallback) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / block_size); - T block_prefix = test_utils::get_random_value(0, 100); + T block_prefix = test_utils::get_random_value(0, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -355,76 +323,61 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanPrefixCallback) expected[i * block_size] = block_prefix; for(size_t j = 0; j < block_size; j++) { - auto idx = i * block_size + j; - expected[idx] = output[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * block_size + j; + expected[idx] = output[idx] + expected[j > 0 ? idx - 1 : idx]; } - expected_block_prefixes[i] = expected[(i+1) * block_size - 1]; + expected_block_prefixes[i] = expected[(i + 1) * block_size - 1]; } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_bp; HIP_CHECK( - hipMalloc( - &device_output_bp, - output_block_prefixes.size() * sizeof(typename decltype(output_block_prefixes)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(hipMalloc(&device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(inclusive_scan_prefix_callback_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_bp, block_prefix - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_block_prefixes.data(), device_output_bp, - output_block_prefixes.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_bp)); } -template< - unsigned int BlockSize, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void exclusive_scan_kernel(T* device_output, T init) +template +__global__ void exclusive_scan_kernel(T* device_output, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; rp::block_scan bscan; bscan.exclusive_scan(value, value, init); device_output[index] = value; @@ -432,8 +385,8 @@ void exclusive_scan_kernel(T* device_output, T init) TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScan) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Given block size not supported @@ -442,11 +395,11 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScan) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -455,41 +408,34 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScan) expected[i * block_size] = init; for(size_t j = 1; j < block_size; j++) { - auto idx = i * block_size + j; - expected[idx] = output[idx-1] + expected[idx-1]; + auto idx = i * block_size + j; + expected[idx] = output[idx - 1] + expected[idx - 1]; } } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(exclusive_scan_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, init - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); @@ -497,17 +443,12 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScan) HIP_CHECK(hipFree(device_output)); } -template< - unsigned int BlockSize, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void exclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions, T init) +template +__global__ void exclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T value = device_output[index]; - T reduction; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T value = device_output[index]; + T reduction; rp::block_scan bscan; bscan.exclusive_scan(value, value, init, reduction); device_output[index] = value; @@ -519,8 +460,8 @@ void exclusive_scan_reduce_kernel(T* device_output, T* device_output_reductions, TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanReduce) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; if(block_size > test_utils::get_max_block_size()) @@ -528,11 +469,11 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanReduce) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Output reduce results std::vector output_reductions(size / block_size); @@ -545,63 +486,51 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanReduce) expected[i * block_size] = init; for(size_t j = 1; j < block_size; j++) { - auto idx = i * block_size + j; - expected[idx] = output[idx-1] + expected[idx-1]; + auto idx = i * block_size + j; + expected[idx] = output[idx - 1] + expected[idx - 1]; } expected_reductions[i] = 0; for(size_t j = 0; j < block_size; j++) { - auto idx = i * block_size + j; + auto idx = i * block_size + j; expected_reductions[i] = expected_reductions[i] + output[idx]; } } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_reductions; HIP_CHECK( - hipMalloc( - &device_output_reductions, - output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(hipMalloc(&device_output_reductions, + output_reductions.size() + * sizeof(typename decltype(output_reductions)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(exclusive_scan_reduce_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions, init - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); @@ -611,19 +540,14 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanReduce) HIP_CHECK(hipFree(device_output_reductions)); } -template< - unsigned int BlockSize, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void exclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) +template +__global__ void + exclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - T prefix_value = block_prefix; - auto prefix_callback = [&prefix_value](T reduction) - { - T prefix = prefix_value; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) { + T prefix = prefix_value; prefix_value = prefix_value + reduction; return prefix; }; @@ -643,8 +567,8 @@ void exclusive_scan_prefix_callback_kernel(T* device_output, T* device_output_bp TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanPrefixCallback) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; constexpr size_t block_size = TestFixture::block_size; // Given block size not supported @@ -653,12 +577,12 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanPrefixCallback) return; } - const size_t size = block_size * 113; + const size_t size = block_size * 113; const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / block_size); - T block_prefix = test_utils::get_random_value(0, 100); + T block_prefix = test_utils::get_random_value(0, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -668,67 +592,57 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanPrefixCallback) expected[i * block_size] = block_prefix; for(size_t j = 1; j < block_size; j++) { - auto idx = i * block_size + j; - expected[idx] = output[idx-1] + expected[idx-1]; + auto idx = i * block_size + j; + expected[idx] = output[idx - 1] + expected[idx - 1]; } expected_block_prefixes[i] = block_prefix; for(size_t j = 0; j < block_size; j++) { - auto idx = i * block_size + j; + auto idx = i * block_size + j; expected_block_prefixes[i] = expected_block_prefixes[i] + output[idx]; } } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_bp; HIP_CHECK( - hipMalloc( - &device_output_bp, - output_block_prefixes.size() * sizeof(typename decltype(output_block_prefixes)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(hipMalloc(&device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(exclusive_scan_prefix_callback_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_bp, block_prefix - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_block_prefixes.data(), device_output_bp, - output_block_prefixes.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_bp)); @@ -738,66 +652,69 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanPrefixCallback) // Test for scan ops taking array of values as input // --------------------------------------------------------- -template +template class RocprimBlockScanInputArrayTests : public ::testing::Test { public: - using type = typename Params::type; - using binary_op_type = typename Params::binary_op_type; - static constexpr unsigned int block_size = Params::block_size; - static constexpr rocprim::block_scan_algorithm algorithm = Params::algorithm; - static constexpr unsigned int items_per_thread = Params::items_per_thread; + using type = typename Params::type; + using binary_op_type = typename Params::binary_op_type; + static constexpr unsigned int block_size = Params::block_size; + static constexpr rocprim::block_scan_algorithm algorithm = Params::algorithm; + static constexpr unsigned int items_per_thread = Params::items_per_thread; }; typedef ::testing::Types< // ----------------------------------------------------------------------- // rocprim::block_scan_algorithm::using_warp_scan // ----------------------------------------------------------------------- - params, - params, - params, - params, + params, + params, + params, + params, params, - params, - params, - params, - params, + params, + params, + params, + params, params, - params, - params, + params, + params, params, 110, 4>, params, 256U, 3>, // ----------------------------------------------------------------------- // rocprim::block_scan_algorithm::reduce_then_scan // ----------------------------------------------------------------------- - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, 256, 5, rocprim::block_scan_algorithm::reduce_then_scan>, - params, 180, 3, rocprim::block_scan_algorithm::reduce_then_scan> -> InputArrayTestParams; + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + 256, + 5, + rocprim::block_scan_algorithm::reduce_then_scan>, + params, + 180, + 3, + rocprim::block_scan_algorithm::reduce_then_scan>> + InputArrayTestParams; TYPED_TEST_CASE(RocprimBlockScanInputArrayTests, InputArrayTestParams); -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void inclusive_scan_array_kernel(T* device_output) +template +__global__ void inclusive_scan_array_kernel(T* device_output) { - const unsigned int index = ((hipBlockIdx_x * BlockSize ) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; // load T in_out[ItemsPerThread]; @@ -814,14 +731,13 @@ void inclusive_scan_array_kernel(T* device_output) { device_output[index + j] = in_out[j]; } - } TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScan) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -831,8 +747,8 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScan) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); @@ -842,41 +758,34 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScan) { for(size_t j = 0; j < items_per_block; j++) { - auto idx = i * items_per_block + j; - expected[idx] = output[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * items_per_block + j; + expected[idx] = output[idx] + expected[j > 0 ? idx - 1 : idx]; } } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(inclusive_scan_array_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); @@ -884,16 +793,13 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScan) HIP_CHECK(hipFree(device_output)); } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions) +template +__global__ void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = ((hipBlockIdx_x * BlockSize ) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; // load T in_out[ItemsPerThread]; @@ -903,7 +809,7 @@ void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc } rp::block_scan bscan; - T reduction; + T reduction; bscan.inclusive_scan(in_out, in_out, reduction); // store @@ -920,9 +826,9 @@ void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanReduce) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -932,8 +838,8 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanReduce) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); @@ -947,65 +853,51 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanReduce) { for(size_t j = 0; j < items_per_block; j++) { - auto idx = i * items_per_block + j; - expected[idx] = output[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * items_per_block + j; + expected[idx] = output[idx] + expected[j > 0 ? idx - 1 : idx]; } - expected_reductions[i] = expected[(i+1) * items_per_block - 1]; + expected_reductions[i] = expected[(i + 1) * items_per_block - 1]; } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_reductions; HIP_CHECK( - hipMalloc( - &device_output_reductions, - output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(hipMalloc(&device_output_reductions, + output_reductions.size() + * sizeof(typename decltype(output_reductions)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK( - hipMemcpy( - device_output_reductions, output_reductions.data(), - output_reductions.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_output_reductions, + output_reductions.data(), + output_reductions.size() * sizeof(T), + hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(inclusive_scan_reduce_array_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions - ); + HIP_KERNEL_NAME( + inclusive_scan_reduce_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); @@ -1015,20 +907,18 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanReduce) HIP_CHECK(hipFree(device_output_reductions)); } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) +template +__global__ void inclusive_scan_array_prefix_callback_kernel(T* device_output, + T* device_output_bp, + T block_prefix) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - T prefix_value = block_prefix; - auto prefix_callback = [&prefix_value](T reduction) - { - T prefix = prefix_value; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) { + T prefix = prefix_value; prefix_value = prefix_value + reduction; return prefix; }; @@ -1058,9 +948,9 @@ void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_out TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanPrefixCallback) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -1070,12 +960,12 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanPrefixCallback) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / items_per_block, 0); - T block_prefix = test_utils::get_random_value(0, 100); + T block_prefix = test_utils::get_random_value(0, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -1085,84 +975,68 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, InclusiveScanPrefixCallback) expected[i * items_per_block] = block_prefix; for(size_t j = 0; j < items_per_block; j++) { - auto idx = i * items_per_block + j; - expected[idx] = output[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * items_per_block + j; + expected[idx] = output[idx] + expected[j > 0 ? idx - 1 : idx]; } - expected_block_prefixes[i] = expected[(i+1) * items_per_block - 1]; + expected_block_prefixes[i] = expected[(i + 1) * items_per_block - 1]; } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_bp; HIP_CHECK( - hipMalloc( - &device_output_bp, - output_block_prefixes.size() * sizeof(typename decltype(output_block_prefixes)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(hipMalloc(&device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); - HIP_CHECK( - hipMemcpy( - device_output_bp, output_block_prefixes.data(), - output_block_prefixes.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_output_bp, + output_block_prefixes.data(), + output_block_prefixes.size() * sizeof(T), + hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - inclusive_scan_array_prefix_callback_kernel - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_bp, block_prefix - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(inclusive_scan_array_prefix_callback_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_block_prefixes.data(), device_output_bp, - output_block_prefixes.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_bp)); } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void exclusive_scan_array_kernel(T* device_output, T init) +template +__global__ void exclusive_scan_array_kernel(T* device_output, T init) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; // load @@ -1184,9 +1058,9 @@ void exclusive_scan_array_kernel(T* device_output, T init) TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScan) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -1196,11 +1070,11 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScan) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -1209,41 +1083,35 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScan) expected[i * items_per_block] = init; for(size_t j = 1; j < items_per_block; j++) { - auto idx = i * items_per_block + j; - expected[idx] = output[idx-1] + expected[idx-1]; + auto idx = i * items_per_block + j; + expected[idx] = output[idx - 1] + expected[idx - 1]; } } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(exclusive_scan_array_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, init - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); @@ -1251,14 +1119,12 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScan) HIP_CHECK(hipFree(device_output)); } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions, T init) +template +__global__ void + exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions, T init) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; // load @@ -1269,7 +1135,7 @@ void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc } rp::block_scan bscan; - T reduction; + T reduction; bscan.exclusive_scan(in_out, in_out, init, reduction); // store @@ -1286,9 +1152,9 @@ void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanReduce) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -1298,14 +1164,14 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanReduce) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); // Output reduce results std::vector output_reductions(size / block_size); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -1315,8 +1181,8 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanReduce) expected[i * items_per_block] = init; for(size_t j = 1; j < items_per_block; j++) { - auto idx = i * items_per_block + j; - expected[idx] = output[idx-1] + expected[idx-1]; + auto idx = i * items_per_block + j; + expected[idx] = output[idx - 1] + expected[idx - 1]; } for(size_t j = 0; j < items_per_block; j++) { @@ -1326,75 +1192,57 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanReduce) // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_reductions; HIP_CHECK( - hipMalloc( - &device_output_reductions, - output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(hipMalloc(&device_output_reductions, + output_reductions.size() + * sizeof(typename decltype(output_reductions)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME( - exclusive_scan_reduce_array_kernel - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_reductions, init - ); + exclusive_scan_reduce_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_reductions, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output_reductions, expected_reductions, 0.01)); } -template< - unsigned int BlockSize, - unsigned int ItemsPerThread, - rocprim::block_scan_algorithm Algorithm, - class T -> -__global__ -void exclusive_scan_prefix_callback_array_kernel( - T* device_output, - T* device_output_bp, - T block_prefix -) +template +__global__ void exclusive_scan_prefix_callback_array_kernel(T* device_output, + T* device_output_bp, + T block_prefix) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - T prefix_value = block_prefix; - auto prefix_callback = [&prefix_value](T reduction) - { - T prefix = prefix_value; + T prefix_value = block_prefix; + auto prefix_callback = [&prefix_value](T reduction) { + T prefix = prefix_value; prefix_value = prefix_value + reduction; return prefix; }; @@ -1403,7 +1251,7 @@ void exclusive_scan_prefix_callback_array_kernel( T in_out[ItemsPerThread]; for(unsigned int j = 0; j < ItemsPerThread; j++) { - in_out[j] = device_output[index+ j]; + in_out[j] = device_output[index + j]; } using bscan_t = rp::block_scan; @@ -1424,9 +1272,9 @@ void exclusive_scan_prefix_callback_array_kernel( TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanPrefixCallback) { - using T = typename TestFixture::type; - constexpr auto algorithm = TestFixture::algorithm; - constexpr size_t block_size = TestFixture::block_size; + using T = typename TestFixture::type; + constexpr auto algorithm = TestFixture::algorithm; + constexpr size_t block_size = TestFixture::block_size; constexpr size_t items_per_thread = TestFixture::items_per_thread; // Given block size not supported @@ -1436,12 +1284,12 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanPrefixCallback) } const size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 37; - const size_t grid_size = size / items_per_block; + const size_t size = items_per_block * 37; + const size_t grid_size = size / items_per_block; // Generate data std::vector output = test_utils::get_random_data(size, 2, 100); std::vector output_block_prefixes(size / items_per_block); - T block_prefix = test_utils::get_random_value(0, 100); + T block_prefix = test_utils::get_random_value(0, 100); // Calculate expected results on host std::vector expected(output.size(), 0); @@ -1451,68 +1299,58 @@ TYPED_TEST(RocprimBlockScanInputArrayTests, ExclusiveScanPrefixCallback) expected[i * items_per_block] = block_prefix; for(size_t j = 1; j < items_per_block; j++) { - auto idx = i * items_per_block + j; - expected[idx] = output[idx-1] + expected[idx-1]; + auto idx = i * items_per_block + j; + expected[idx] = output[idx - 1] + expected[idx - 1]; } expected_block_prefixes[i] = block_prefix; for(size_t j = 0; j < items_per_block; j++) { - auto idx = i * items_per_block + j; + auto idx = i * items_per_block + j; expected_block_prefixes[i] = expected_block_prefixes[i] + output[idx]; } } // Writing to device memory T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_bp; HIP_CHECK( - hipMalloc( - &device_output_bp, - output_block_prefixes.size() * sizeof(typename decltype(output_block_prefixes)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_bp; + HIP_CHECK(hipMalloc(&device_output_bp, + output_block_prefixes.size() + * sizeof(typename decltype(output_block_prefixes)::value_type))); HIP_CHECK( - hipMemcpy( - device_output, output.data(), - output.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_output, output.data(), output.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - exclusive_scan_prefix_callback_array_kernel - ), - dim3(grid_size), dim3(block_size), 0, 0, - device_output, device_output_bp, block_prefix - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(exclusive_scan_prefix_callback_array_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_output, + device_output_bp, + block_prefix); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_block_prefixes.data(), device_output_bp, - output_block_prefixes.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_block_prefixes.data(), + device_output_bp, + output_block_prefixes.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_near(output_block_prefixes, expected_block_prefixes, 0.01)); HIP_CHECK(hipFree(device_output)); HIP_CHECK(hipFree(device_output_bp)); diff --git a/test/rocprim/test_block_sort.cpp b/test/rocprim/test_block_sort.cpp index c405bc4c3..6f9e579a8 100644 --- a/test/rocprim/test_block_sort.cpp +++ b/test/rocprim/test_block_sort.cpp @@ -38,27 +38,24 @@ namespace rp = rocprim; -template< - class Key, - class Value, - unsigned int BlockSize -> +template struct params { - using key_type = Key; - using value_type = Value; + using key_type = Key; + using value_type = Value; static constexpr unsigned int block_size = BlockSize; }; -template -class RocprimBlockSortTests : public ::testing::Test { +template +class RocprimBlockSortTests : public ::testing::Test +{ public: - using key_type = typename Params::key_type; - using value_type = typename Params::value_type; + using key_type = typename Params::key_type; + using value_type = typename Params::value_type; static constexpr unsigned int block_size = Params::block_size; }; -using custom_int2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< @@ -76,20 +73,16 @@ typedef ::testing::Types< params, params, params, - params -> BlockSizes; + params> + BlockSizes; TYPED_TEST_CASE(RocprimBlockSortTests, BlockSizes); -template< - unsigned int BlockSize, - class key_type -> -__global__ -void sort_key_kernel(key_type * device_key_output) +template +__global__ void sort_key_kernel(key_type* device_key_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - key_type key = device_key_output[index]; + const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + key_type key = device_key_output[index]; rp::block_sort bsort; bsort.sort(key); device_key_output[index] = key; @@ -97,10 +90,10 @@ void sort_key_kernel(key_type * device_key_output) TYPED_TEST(RocprimBlockSortTests, SortKey) { - using key_type = typename TestFixture::key_type; + using key_type = typename TestFixture::key_type; const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; + const size_t size = block_size * 1134; + const size_t grid_size = size / block_size; // Generate data std::vector output = test_utils::get_random_data(size, -100, 100); @@ -110,39 +103,27 @@ TYPED_TEST(RocprimBlockSortTests, SortKey) for(size_t i = 0; i < output.size() / block_size; i++) { - std::sort( - expected.begin() + (i * block_size), - expected.begin() + ((i + 1) * block_size) - ); + std::sort(expected.begin() + (i * block_size), expected.begin() + ((i + 1) * block_size)); } // Preparing device - key_type * device_key_output; + key_type* device_key_output; HIP_CHECK(hipMalloc(&device_key_output, output.size() * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - device_key_output, output.data(), - output.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + device_key_output, output.data(), output.size() * sizeof(key_type), hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_key_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_key_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_key_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_key_output); // Reading results back - HIP_CHECK( - hipMemcpy( - output.data(), device_key_output, - output.size() * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), device_key_output, output.size() * sizeof(key_type), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -152,52 +133,42 @@ TYPED_TEST(RocprimBlockSortTests, SortKey) HIP_CHECK(hipFree(device_key_output)); } -template< - unsigned int BlockSize, - class key_type, - class value_type -> -__global__ -void sort_key_value_kernel(key_type * device_key_output, value_type * device_value_output) +template +__global__ void sort_key_value_kernel(key_type* device_key_output, value_type* device_value_output) { const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - key_type key = device_key_output[index]; - value_type value = device_value_output[index]; + key_type key = device_key_output[index]; + value_type value = device_value_output[index]; rp::block_sort bsort; bsort.sort(key, value); - device_key_output[index] = key; + device_key_output[index] = key; device_value_output[index] = value; } TYPED_TEST(RocprimBlockSortTests, SortKeyValue) { - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; + const size_t size = block_size * 1134; + const size_t grid_size = size / block_size; // Generate data std::vector output_key(size); for(size_t i = 0; i < output_key.size() / block_size; i++) { std::iota( - output_key.begin() + (i * block_size), - output_key.begin() + ((i + 1) * block_size), - 0 - ); - - std::shuffle( - output_key.begin() + (i * block_size), - output_key.begin() + ((i + 1) * block_size), - std::mt19937{std::random_device{}()} - ); + output_key.begin() + (i * block_size), output_key.begin() + ((i + 1) * block_size), 0); + + std::shuffle(output_key.begin() + (i * block_size), + output_key.begin() + ((i + 1) * block_size), + std::mt19937 {std::random_device {}()}); } std::vector output_value = test_utils::get_random_data(size, -100, 100); // Combine vectors to form pairs with key and value std::vector> target(size); - for (unsigned i = 0; i < target.size(); i++) + for(unsigned i = 0; i < target.size(); i++) target[i] = std::make_pair(output_key[i], output_value[i]); // Calculate expected results on host @@ -205,57 +176,44 @@ TYPED_TEST(RocprimBlockSortTests, SortKeyValue) for(size_t i = 0; i < expected.size() / block_size; i++) { - std::sort( - expected.begin() + (i * block_size), - expected.begin() + ((i + 1) * block_size) - ); + std::sort(expected.begin() + (i * block_size), expected.begin() + ((i + 1) * block_size)); } // Preparing device - key_type * device_key_output; + key_type* device_key_output; HIP_CHECK(hipMalloc(&device_key_output, output_key.size() * sizeof(key_type))); - value_type * device_value_output; + value_type* device_value_output; HIP_CHECK(hipMalloc(&device_value_output, output_value.size() * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - device_key_output, output_key.data(), - output_key.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - HIP_CHECK( - hipMemcpy( - device_value_output, output_value.data(), - output_value.size() * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_key_output, + output_key.data(), + output_key.size() * sizeof(key_type), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_value_output, + output_value.data(), + output_value.size() * sizeof(value_type), + hipMemcpyHostToDevice)); // Running kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_key_value_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_key_output, device_value_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(sort_key_value_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_key_output, + device_value_output); // Reading results back - HIP_CHECK( - hipMemcpy( - output_key.data(), device_key_output, - output_key.size() * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); - - HIP_CHECK( - hipMemcpy( - output_value.data(), device_value_output, - output_value.size() * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_key.data(), + device_key_output, + output_key.size() * sizeof(key_type), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_value.data(), + device_value_output, + output_value.size() * sizeof(value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < expected.size(); i++) { @@ -264,7 +222,7 @@ TYPED_TEST(RocprimBlockSortTests, SortKeyValue) } } -template +template struct key_value_comparator { bool operator()(const std::pair& lhs, const std::pair& rhs) @@ -273,52 +231,43 @@ struct key_value_comparator } }; -template< - unsigned int BlockSize, - class key_type, - class value_type -> -__global__ -void custom_sort_key_value_kernel(key_type * device_key_output, value_type * device_value_output) +template +__global__ void custom_sort_key_value_kernel(key_type* device_key_output, + value_type* device_value_output) { const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; - key_type key = device_key_output[index]; - value_type value = device_value_output[index]; + key_type key = device_key_output[index]; + value_type value = device_value_output[index]; rp::block_sort bsort; bsort.sort(key, value, rocprim::greater()); - device_key_output[index] = key; + device_key_output[index] = key; device_value_output[index] = value; } TYPED_TEST(RocprimBlockSortTests, CustomSortKeyValue) { - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; + const size_t size = block_size * 1134; + const size_t grid_size = size / block_size; // Generate data std::vector output_key(size); for(size_t i = 0; i < output_key.size() / block_size; i++) { std::iota( - output_key.begin() + (i * block_size), - output_key.begin() + ((i + 1) * block_size), - 0 - ); - - std::shuffle( - output_key.begin() + (i * block_size), - output_key.begin() + ((i + 1) * block_size), - std::mt19937{std::random_device{}()} - ); + output_key.begin() + (i * block_size), output_key.begin() + ((i + 1) * block_size), 0); + + std::shuffle(output_key.begin() + (i * block_size), + output_key.begin() + ((i + 1) * block_size), + std::mt19937 {std::random_device {}()}); } std::vector output_value = test_utils::get_random_data(size, -100, 100); // Combine vectors to form pairs with key and value std::vector> target(size); - for (unsigned i = 0; i < target.size(); i++) + for(unsigned i = 0; i < target.size(); i++) target[i] = std::make_pair(output_key[i], output_value[i]); // Calculate expected results on host @@ -326,58 +275,47 @@ TYPED_TEST(RocprimBlockSortTests, CustomSortKeyValue) for(size_t i = 0; i < expected.size() / block_size; i++) { - std::sort( - expected.begin() + (i * block_size), - expected.begin() + ((i + 1) * block_size), - key_value_comparator() - ); + std::sort(expected.begin() + (i * block_size), + expected.begin() + ((i + 1) * block_size), + key_value_comparator()); } // Preparing device - key_type * device_key_output; + key_type* device_key_output; HIP_CHECK(hipMalloc(&device_key_output, output_key.size() * sizeof(key_type))); - value_type * device_value_output; + value_type* device_value_output; HIP_CHECK(hipMalloc(&device_value_output, output_value.size() * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - device_key_output, output_key.data(), - output_key.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - HIP_CHECK( - hipMemcpy( - device_value_output, output_value.data(), - output_value.size() * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_key_output, + output_key.data(), + output_key.size() * sizeof(key_type), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(device_value_output, + output_value.data(), + output_value.size() * sizeof(value_type), + hipMemcpyHostToDevice)); // Running kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(custom_sort_key_value_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_key_output, device_value_output - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_key_output, + device_value_output); // Reading results back - HIP_CHECK( - hipMemcpy( - output_key.data(), device_key_output, - output_key.size() * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); - - HIP_CHECK( - hipMemcpy( - output_value.data(), device_value_output, - output_value.size() * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_key.data(), + device_key_output, + output_key.size() * sizeof(key_type), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_value.data(), + device_value_output, + output_value.size() * sizeof(value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < expected.size(); i++) { diff --git a/test/rocprim/test_constant_iterator.cpp b/test/rocprim/test_constant_iterator.cpp index 9fc93dba5..9adccde0b 100644 --- a/test/rocprim/test_constant_iterator.cpp +++ b/test/rocprim/test_constant_iterator.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include +#include #include +#include // Google Test #include @@ -35,39 +35,37 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(error,hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(error, hipSuccess) namespace rp = rocprim; // Params for tests -template +template struct RocprimConstantIteratorParams { using input_type = InputType; }; -template +template class RocprimConstantIteratorTests : public ::testing::Test { public: - using input_type = typename Params::input_type; + using input_type = typename Params::input_type; const bool debug_synchronous = false; }; -typedef ::testing::Types< - RocprimConstantIteratorParams, - RocprimConstantIteratorParams, - RocprimConstantIteratorParams, - RocprimConstantIteratorParams -> RocprimConstantIteratorTestsParams; +typedef ::testing::Types, + RocprimConstantIteratorParams, + RocprimConstantIteratorParams, + RocprimConstantIteratorParams> + RocprimConstantIteratorTestsParams; TYPED_TEST_CASE(RocprimConstantIteratorTests, RocprimConstantIteratorTestsParams); -template +template struct transform { - __device__ __host__ - constexpr T operator()(const T& a) const + __device__ __host__ constexpr T operator()(const T& a) const { return 5 + a; } @@ -75,8 +73,8 @@ struct transform TYPED_TEST(RocprimConstantIteratorTests, Transform) { - using T = typename TestFixture::input_type; - using Iterator = typename rocprim::constant_iterator; + using T = typename TestFixture::input_type; + using Iterator = typename rocprim::constant_iterator; const bool debug_synchronous = TestFixture::debug_synchronous; const size_t size = 1024; @@ -85,10 +83,10 @@ TYPED_TEST(RocprimConstantIteratorTests, Transform) // Create constant_iterator with random starting point const auto value = test_utils::get_random_value(0, 200); - Iterator input_begin(value); + Iterator input_begin(value); std::vector output(size); - T * d_output; + T* d_output; HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(T))); HIP_CHECK(hipDeviceSynchronize()); @@ -97,22 +95,12 @@ TYPED_TEST(RocprimConstantIteratorTests, Transform) // Run HIP_CHECK( - rocprim::transform( - input_begin, d_output, size, - transform(), stream, debug_synchronous - ) - ); + rocprim::transform(input_begin, d_output, size, transform(), stream, debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), d_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Validating results diff --git a/test/rocprim/test_counting_iterator.cpp b/test/rocprim/test_counting_iterator.cpp index bb1350362..399d8f829 100644 --- a/test/rocprim/test_counting_iterator.cpp +++ b/test/rocprim/test_counting_iterator.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include +#include #include +#include // Google Test #include @@ -40,34 +40,32 @@ namespace rp = rocprim; // Params for tests -template +template struct RocprimCountingIteratorParams { using input_type = InputType; }; -template +template class RocprimCountingIteratorTests : public ::testing::Test { public: - using input_type = typename Params::input_type; + using input_type = typename Params::input_type; const bool debug_synchronous = false; }; -typedef ::testing::Types< - RocprimCountingIteratorParams, - RocprimCountingIteratorParams, - RocprimCountingIteratorParams, - RocprimCountingIteratorParams -> RocprimCountingIteratorTestsParams; +typedef ::testing::Types, + RocprimCountingIteratorParams, + RocprimCountingIteratorParams, + RocprimCountingIteratorParams> + RocprimCountingIteratorTestsParams; TYPED_TEST_CASE(RocprimCountingIteratorTests, RocprimCountingIteratorTestsParams); -template +template struct transform { - __device__ __host__ - constexpr T operator()(const T& a) const + __device__ __host__ constexpr T operator()(const T& a) const { return 5 + a; } @@ -75,8 +73,8 @@ struct transform TYPED_TEST(RocprimCountingIteratorTests, Transform) { - using T = typename TestFixture::input_type; - using Iterator = typename rocprim::counting_iterator; + using T = typename TestFixture::input_type; + using Iterator = typename rocprim::counting_iterator; const bool debug_synchronous = TestFixture::debug_synchronous; const size_t size = 1024; @@ -87,37 +85,22 @@ TYPED_TEST(RocprimCountingIteratorTests, Transform) Iterator input_begin(test_utils::get_random_value(0, 200)); std::vector output(size); - T * d_output; + T* d_output; HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(T))); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host std::vector expected(size); - std::transform( - input_begin, - input_begin + size, - expected.begin(), - transform() - ); + std::transform(input_begin, input_begin + size, expected.begin(), transform()); // Run HIP_CHECK( - rocprim::transform( - input_begin, d_output, size, - transform(), stream, debug_synchronous - ) - ); + rocprim::transform(input_begin, d_output, size, transform(), stream, debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), d_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Validating results diff --git a/test/rocprim/test_device_binary_search.cpp b/test/rocprim/test_device_binary_search.cpp index 688d7f201..dc5cb9afe 100644 --- a/test/rocprim/test_device_binary_search.cpp +++ b/test/rocprim/test_device_binary_search.cpp @@ -24,8 +24,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -39,43 +39,43 @@ #define HIP_CHECK(error) ASSERT_EQ(error, hipSuccess) -template< - class Haystack, - class Needle, - class Output = size_t, - class CompareFunction = rocprim::less<> -> +template > struct params { - using haystack_type = Haystack; - using needle_type = Needle; - using output_type = Output; + using haystack_type = Haystack; + using needle_type = Needle; + using output_type = Output; using compare_op_type = CompareFunction; }; -template -class RocprimDeviceBinarySearch : public ::testing::Test { +template +class RocprimDeviceBinarySearch : public ::testing::Test +{ public: using params = Params; }; -using custom_int2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< params, - params >, - params >, + params>, + params>, params, params, - params > -> Params; + params>> + Params; TYPED_TEST_CASE(RocprimDeviceBinarySearch, Params); std::vector get_sizes() { - std::vector sizes = { 1, 10, 53, 211, 1024, 2345, 4096, 34567, (1 << 16) - 1220, (1 << 22) - 76543 }; + std::vector sizes + = {1, 10, 53, 211, 1024, 2345, 4096, 34567, (1 << 16) - 1220, (1 << 22) - 76543}; const std::vector random_sizes = test_utils::get_random_data(5, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); return sizes; @@ -83,9 +83,9 @@ std::vector get_sizes() TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) { - using haystack_type = typename TestFixture::params::haystack_type; - using needle_type = typename TestFixture::params::needle_type; - using output_type = typename TestFixture::params::output_type; + using haystack_type = typename TestFixture::params::haystack_type; + using needle_type = typename TestFixture::params::needle_type; + using output_type = typename TestFixture::params::output_type; using compare_op_type = typename TestFixture::params::compare_op_type; hipStream_t stream = 0; @@ -99,84 +99,70 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); - const size_t d = haystack_size / 100; + const size_t needles_size = std::sqrt(size); + const size_t d = haystack_size / 100; // Generate data - std::vector haystack = test_utils::get_random_data( - haystack_size, 0, haystack_size + 2 * d - ); + std::vector haystack + = test_utils::get_random_data(haystack_size, 0, haystack_size + 2 * d); std::sort(haystack.begin(), haystack.end(), compare_op); // Use a narrower range for needles for checking out-of-haystack cases - std::vector needles = test_utils::get_random_data( - needles_size, d, haystack_size + d - ); + std::vector needles + = test_utils::get_random_data(needles_size, d, haystack_size + d); - haystack_type * d_haystack; - needle_type * d_needles; - output_type * d_output; + haystack_type* d_haystack; + needle_type* d_needles; + output_type* d_output; HIP_CHECK(hipMalloc(&d_haystack, haystack_size * sizeof(haystack_type))); HIP_CHECK(hipMalloc(&d_needles, needles_size * sizeof(needle_type))); HIP_CHECK(hipMalloc(&d_output, needles_size * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_haystack, haystack.data(), - haystack_size * sizeof(haystack_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_needles, needles.data(), - needles_size * sizeof(needle_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_haystack, + haystack.data(), + haystack_size * sizeof(haystack_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_needles, needles.data(), needles_size * sizeof(needle_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector expected(needles_size); for(size_t i = 0; i < needles_size; i++) { - expected[i] = - std::lower_bound(haystack.begin(), haystack.end(), needles[i], compare_op) - - haystack.begin(); + expected[i] = std::lower_bound(haystack.begin(), haystack.end(), needles[i], compare_op) + - haystack.begin(); } - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK( - rocprim::lower_bound( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::lower_bound(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::lower_bound( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::lower_bound(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream, + debug_synchronous)); std::vector output(needles_size); - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - needles_size * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, needles_size * sizeof(output_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_haystack)); @@ -189,9 +175,9 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) { - using haystack_type = typename TestFixture::params::haystack_type; - using needle_type = typename TestFixture::params::needle_type; - using output_type = typename TestFixture::params::output_type; + using haystack_type = typename TestFixture::params::haystack_type; + using needle_type = typename TestFixture::params::needle_type; + using output_type = typename TestFixture::params::output_type; using compare_op_type = typename TestFixture::params::compare_op_type; hipStream_t stream = 0; @@ -205,84 +191,70 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); - const size_t d = haystack_size / 100; + const size_t needles_size = std::sqrt(size); + const size_t d = haystack_size / 100; // Generate data - std::vector haystack = test_utils::get_random_data( - haystack_size, 0, haystack_size + 2 * d - ); + std::vector haystack + = test_utils::get_random_data(haystack_size, 0, haystack_size + 2 * d); std::sort(haystack.begin(), haystack.end(), compare_op); // Use a narrower range for needles for checking out-of-haystack cases - std::vector needles = test_utils::get_random_data( - needles_size, d, haystack_size + d - ); + std::vector needles + = test_utils::get_random_data(needles_size, d, haystack_size + d); - haystack_type * d_haystack; - needle_type * d_needles; - output_type * d_output; + haystack_type* d_haystack; + needle_type* d_needles; + output_type* d_output; HIP_CHECK(hipMalloc(&d_haystack, haystack_size * sizeof(haystack_type))); HIP_CHECK(hipMalloc(&d_needles, needles_size * sizeof(needle_type))); HIP_CHECK(hipMalloc(&d_output, needles_size * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_haystack, haystack.data(), - haystack_size * sizeof(haystack_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_needles, needles.data(), - needles_size * sizeof(needle_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_haystack, + haystack.data(), + haystack_size * sizeof(haystack_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_needles, needles.data(), needles_size * sizeof(needle_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector expected(needles_size); for(size_t i = 0; i < needles_size; i++) { - expected[i] = - std::upper_bound(haystack.begin(), haystack.end(), needles[i], compare_op) - - haystack.begin(); + expected[i] = std::upper_bound(haystack.begin(), haystack.end(), needles[i], compare_op) + - haystack.begin(); } - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK( - rocprim::upper_bound( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::upper_bound(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::upper_bound( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::upper_bound(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream, + debug_synchronous)); std::vector output(needles_size); - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - needles_size * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, needles_size * sizeof(output_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_haystack)); @@ -295,9 +267,9 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) { - using haystack_type = typename TestFixture::params::haystack_type; - using needle_type = typename TestFixture::params::needle_type; - using output_type = typename TestFixture::params::output_type; + using haystack_type = typename TestFixture::params::haystack_type; + using needle_type = typename TestFixture::params::needle_type; + using output_type = typename TestFixture::params::output_type; using compare_op_type = typename TestFixture::params::compare_op_type; hipStream_t stream = 0; @@ -311,82 +283,70 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); - const size_t d = haystack_size / 100; + const size_t needles_size = std::sqrt(size); + const size_t d = haystack_size / 100; // Generate data - std::vector haystack = test_utils::get_random_data( - haystack_size, 0, haystack_size + 2 * d - ); + std::vector haystack + = test_utils::get_random_data(haystack_size, 0, haystack_size + 2 * d); std::sort(haystack.begin(), haystack.end(), compare_op); // Use a narrower range for needles for checking out-of-haystack cases - std::vector needles = test_utils::get_random_data( - needles_size, d, haystack_size + d - ); + std::vector needles + = test_utils::get_random_data(needles_size, d, haystack_size + d); - haystack_type * d_haystack; - needle_type * d_needles; - output_type * d_output; + haystack_type* d_haystack; + needle_type* d_needles; + output_type* d_output; HIP_CHECK(hipMalloc(&d_haystack, haystack_size * sizeof(haystack_type))); HIP_CHECK(hipMalloc(&d_needles, needles_size * sizeof(needle_type))); HIP_CHECK(hipMalloc(&d_output, needles_size * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_haystack, haystack.data(), - haystack_size * sizeof(haystack_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_needles, needles.data(), - needles_size * sizeof(needle_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_haystack, + haystack.data(), + haystack_size * sizeof(haystack_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_needles, needles.data(), needles_size * sizeof(needle_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector expected(needles_size); for(size_t i = 0; i < needles_size; i++) { - expected[i] = std::binary_search(haystack.begin(), haystack.end(), needles[i], compare_op); + expected[i] + = std::binary_search(haystack.begin(), haystack.end(), needles[i], compare_op); } - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK( - rocprim::binary_search( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::binary_search(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::binary_search( - d_temporary_storage, temporary_storage_bytes, - d_haystack, d_needles, d_output, - haystack_size, needles_size, - compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::binary_search(d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream, + debug_synchronous)); std::vector output(needles_size); - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - needles_size * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, needles_size * sizeof(output_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_haystack)); diff --git a/test/rocprim/test_device_histogram.cpp b/test/rocprim/test_device_histogram.cpp index a924be7f4..b7ee419d6 100644 --- a/test/rocprim/test_device_histogram.cpp +++ b/test/rocprim/test_device_histogram.cpp @@ -23,10 +23,10 @@ #include #include #include -#include #include -#include +#include #include +#include // Google Test #include @@ -72,108 +72,101 @@ std::vector> get_dims() // Generate values ouside the desired histogram range (+-10%) // (correctly handling test cases like uchar [0, 256), ushort [0, 65536)) -template -inline auto get_random_samples(size_t size, U min, U max) - -> typename std::enable_if::value, std::vector>::type +template +inline auto get_random_samples(size_t size, U min, U max) -> + typename std::enable_if::value, std::vector>::type { const long long min1 = static_cast(min); const long long max1 = static_cast(max); - const long long d = max1 - min1; + const long long d = max1 - min1; return test_utils::get_random_data( size, - static_cast(std::max(min1 - d / 10, static_cast(std::numeric_limits::lowest()))), - static_cast(std::min(max1 + d / 10, static_cast(std::numeric_limits::max()))) - ); + static_cast( + std::max(min1 - d / 10, static_cast(std::numeric_limits::lowest()))), + static_cast( + std::min(max1 + d / 10, static_cast(std::numeric_limits::max())))); } -template -inline auto get_random_samples(size_t size, U min, U max) - -> typename std::enable_if::value, std::vector>::type +template +inline auto get_random_samples(size_t size, U min, U max) -> + typename std::enable_if::value, std::vector>::type { const double min1 = static_cast(min); const double max1 = static_cast(max); - const double d = max1 - min1; + const double d = max1 - min1; return test_utils::get_random_data( size, - static_cast(std::max(min1 - d / 10, static_cast(std::numeric_limits::lowest()))), - static_cast(std::min(max1 + d / 10, static_cast(std::numeric_limits::max()))) - ); + static_cast( + std::max(min1 - d / 10, static_cast(std::numeric_limits::lowest()))), + static_cast( + std::min(max1 + d / 10, static_cast(std::numeric_limits::max())))); } // Does nothing, used for testing iterators (not raw pointers) as samples input -template +template struct transform_op { - __host__ __device__ inline - T operator()(T x) const + __host__ __device__ inline T operator()(T x) const { return x * 1; } }; -template< - class SampleType, - unsigned int Bins, - int LowerLevel, - int UpperLevel, - class LevelType = SampleType, - class CounterType = int -> +template struct params1 { - using sample_type = SampleType; - static constexpr unsigned int bins = Bins; - static constexpr int lower_level = LowerLevel; - static constexpr int upper_level = UpperLevel; - using level_type = LevelType; - using counter_type = CounterType; + using sample_type = SampleType; + static constexpr unsigned int bins = Bins; + static constexpr int lower_level = LowerLevel; + static constexpr int upper_level = UpperLevel; + using level_type = LevelType; + using counter_type = CounterType; }; -template -class RocprimDeviceHistogramEven : public ::testing::Test { +template +class RocprimDeviceHistogramEven : public ::testing::Test +{ public: using params = Params; }; -typedef ::testing::Types< - params1, - params1, - params1, - params1, - params1, - params1, +typedef ::testing::Types, + params1, + params1, + params1, + params1, + params1, - params1, - params1, - params1 -> Params1; + params1, + params1, + params1> + Params1; TYPED_TEST_CASE(RocprimDeviceHistogramEven, Params1); TEST(RocprimDeviceHistogramEven, IncorrectInput) { size_t temporary_storage_bytes = 0; - int * d_input = nullptr; - int * d_histogram = nullptr; + int* d_input = nullptr; + int* d_histogram = nullptr; ASSERT_EQ( - rp::histogram_even( - nullptr, temporary_storage_bytes, - d_input, 123, - d_histogram, - 1, 1, 2 - ), - hipErrorInvalidValue - ); + rp::histogram_even(nullptr, temporary_storage_bytes, d_input, 123, d_histogram, 1, 1, 2), + hipErrorInvalidValue); } TYPED_TEST(RocprimDeviceHistogramEven, Even) { - using sample_type = typename TestFixture::params::sample_type; - using counter_type = typename TestFixture::params::counter_type; - using level_type = typename TestFixture::params::level_type; - constexpr unsigned int bins = TestFixture::params::bins; - constexpr level_type lower_level = TestFixture::params::lower_level; - constexpr level_type upper_level = TestFixture::params::upper_level; + using sample_type = typename TestFixture::params::sample_type; + using counter_type = typename TestFixture::params::counter_type; + using level_type = typename TestFixture::params::level_type; + constexpr unsigned int bins = TestFixture::params::bins; + constexpr level_type lower_level = TestFixture::params::lower_level; + constexpr level_type upper_level = TestFixture::params::upper_level; hipStream_t stream = 0; @@ -181,42 +174,36 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) for(auto dim : get_dims()) { - SCOPED_TRACE( - testing::Message() << "with dim = {" << - std::get<0>(dim) << ", " << std::get<1>(dim) << ", " << std::get<2>(dim) << "}" - ); + SCOPED_TRACE(testing::Message() << "with dim = {" << std::get<0>(dim) << ", " + << std::get<1>(dim) << ", " << std::get<2>(dim) << "}"); - const size_t rows = std::get<0>(dim); - const size_t columns = std::get<1>(dim); + const size_t rows = std::get<0>(dim); + const size_t columns = std::get<1>(dim); const size_t row_stride = columns + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = std::max(1, rows * row_stride); // Generate data - std::vector input = get_random_samples(size, lower_level, upper_level); + std::vector input + = get_random_samples(size, lower_level, upper_level); - sample_type * d_input; - counter_type * d_histogram; + sample_type* d_input; + counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(sample_type))); HIP_CHECK(hipMalloc(&d_histogram, bins * sizeof(counter_type))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(sample_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), size * sizeof(sample_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector histogram_expected(bins, 0); - const level_type scale = (upper_level - lower_level) / bins; + const level_type scale = (upper_level - lower_level) / bins; for(size_t row = 0; row < rows; row++) { for(size_t column = 0; column < columns; column++) { const sample_type sample = input[row * row_stride + column]; - const level_type s = static_cast(sample); + const level_type s = static_cast(sample); if(s >= lower_level && s < upper_level) { const int bin = (s - lower_level) / scale; @@ -230,67 +217,70 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) size_t temporary_storage_bytes = 0; if(rows == 1) { - HIP_CHECK( - rp::histogram_even( - nullptr, temporary_storage_bytes, - d_input, columns, - d_histogram, - bins + 1, lower_level, upper_level, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_even(nullptr, + temporary_storage_bytes, + d_input, + columns, + d_histogram, + bins + 1, + lower_level, + upper_level, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::histogram_even( - nullptr, temporary_storage_bytes, - d_input, columns, rows, row_stride_bytes, - d_histogram, - bins + 1, lower_level, upper_level, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_even(nullptr, + temporary_storage_bytes, + d_input, + columns, + rows, + row_stride_bytes, + d_histogram, + bins + 1, + lower_level, + upper_level, + stream, + debug_synchronous)); } ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(rows == 1) { - HIP_CHECK( - rp::histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, columns, - d_histogram, - bins + 1, lower_level, upper_level, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + columns, + d_histogram, + bins + 1, + lower_level, + upper_level, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input, columns, rows, row_stride_bytes, - d_histogram, - bins + 1, lower_level, upper_level, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input, + columns, + rows, + row_stride_bytes, + d_histogram, + bins + 1, + lower_level, + upper_level, + stream, + debug_synchronous)); } std::vector histogram(bins); - HIP_CHECK( - hipMemcpy( - histogram.data(), d_histogram, - bins * sizeof(counter_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + histogram.data(), d_histogram, bins * sizeof(counter_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); @@ -303,127 +293,106 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) } } -template< - class SampleType, - unsigned int Bins, - int StartLevel = 0, - unsigned int MinBinWidth = 1, - unsigned int MaxBinWidth = 10, - class LevelType = SampleType, - class CounterType = int -> +template struct params2 { - using sample_type = SampleType; - static constexpr unsigned int bins = Bins; - static constexpr int start_level = StartLevel; + using sample_type = SampleType; + static constexpr unsigned int bins = Bins; + static constexpr int start_level = StartLevel; static constexpr unsigned int min_bin_length = MinBinWidth; static constexpr unsigned int max_bin_length = MaxBinWidth; - using level_type = LevelType; - using counter_type = CounterType; + using level_type = LevelType; + using counter_type = CounterType; }; -template -class RocprimDeviceHistogramRange : public ::testing::Test { +template +class RocprimDeviceHistogramRange : public ::testing::Test +{ public: using params = Params; }; -typedef ::testing::Types< - params2, - params2, - params2, - params2, - params2, +typedef ::testing::Types, + params2, + params2, + params2, + params2, - params2, - params2 -> Params2; + params2, + params2> + Params2; TYPED_TEST_CASE(RocprimDeviceHistogramRange, Params2); TEST(RocprimDeviceHistogramRange, IncorrectInput) { size_t temporary_storage_bytes = 0; - int * d_input = nullptr; - int * d_histogram = nullptr; - int * d_levels = nullptr; - ASSERT_EQ( - rp::histogram_range( - nullptr, temporary_storage_bytes, - d_input, 123, - d_histogram, - 1, d_levels - ), - hipErrorInvalidValue - ); + int* d_input = nullptr; + int* d_histogram = nullptr; + int* d_levels = nullptr; + ASSERT_EQ(rp::histogram_range( + nullptr, temporary_storage_bytes, d_input, 123, d_histogram, 1, d_levels), + hipErrorInvalidValue); } TYPED_TEST(RocprimDeviceHistogramRange, Range) { - using sample_type = typename TestFixture::params::sample_type; - using counter_type = typename TestFixture::params::counter_type; - using level_type = typename TestFixture::params::level_type; + using sample_type = typename TestFixture::params::sample_type; + using counter_type = typename TestFixture::params::counter_type; + using level_type = typename TestFixture::params::level_type; constexpr unsigned int bins = TestFixture::params::bins; hipStream_t stream = 0; const bool debug_synchronous = false; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); - std::uniform_int_distribution bin_length_dis( - TestFixture::params::min_bin_length, - TestFixture::params::max_bin_length - ); + std::uniform_int_distribution bin_length_dis(TestFixture::params::min_bin_length, + TestFixture::params::max_bin_length); for(auto dim : get_dims()) { - SCOPED_TRACE( - testing::Message() << "with dim = {" << - std::get<0>(dim) << ", " << std::get<1>(dim) << ", " << std::get<2>(dim) << "}" - ); + SCOPED_TRACE(testing::Message() << "with dim = {" << std::get<0>(dim) << ", " + << std::get<1>(dim) << ", " << std::get<2>(dim) << "}"); - const size_t rows = std::get<0>(dim); - const size_t columns = std::get<1>(dim); + const size_t rows = std::get<0>(dim); + const size_t columns = std::get<1>(dim); const size_t row_stride = columns + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = std::max(1, rows * row_stride); // Generate data std::vector levels; - level_type level = TestFixture::params::start_level; - for(unsigned int bin = 0 ; bin < bins; bin++) + level_type level = TestFixture::params::start_level; + for(unsigned int bin = 0; bin < bins; bin++) { levels.push_back(level); level += bin_length_dis(gen); } levels.push_back(level); - std::vector input = get_random_samples(size, levels[0], levels[bins]); + std::vector input + = get_random_samples(size, levels[0], levels[bins]); - sample_type * d_input; - level_type * d_levels; - counter_type * d_histogram; + sample_type* d_input; + level_type* d_levels; + counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(sample_type))); HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(level_type))); HIP_CHECK(hipMalloc(&d_histogram, bins * sizeof(counter_type))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(sample_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_levels, levels.data(), - (bins + 1) * sizeof(level_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), size * sizeof(sample_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_levels, levels.data(), (bins + 1) * sizeof(level_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector histogram_expected(bins, 0); @@ -432,7 +401,7 @@ TYPED_TEST(RocprimDeviceHistogramRange, Range) for(size_t column = 0; column < columns; column++) { const sample_type sample = input[row * row_stride + column]; - const level_type s = static_cast(sample); + const level_type s = static_cast(sample); if(s >= levels[0] && s < levels[bins]) { const auto bin_iter = std::upper_bound(levels.begin(), levels.end(), s); @@ -441,75 +410,72 @@ TYPED_TEST(RocprimDeviceHistogramRange, Range) } } - rp::transform_iterator, sample_type> d_input2( - d_input, - transform_op() - ); + rp::transform_iterator, sample_type> d_input2( + d_input, transform_op()); size_t temporary_storage_bytes = 0; if(rows == 1) { - HIP_CHECK( - rp::histogram_range( - nullptr, temporary_storage_bytes, - d_input2, columns, - d_histogram, - bins + 1, d_levels, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_range(nullptr, + temporary_storage_bytes, + d_input2, + columns, + d_histogram, + bins + 1, + d_levels, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::histogram_range( - nullptr, temporary_storage_bytes, - d_input2, columns, rows, row_stride_bytes, - d_histogram, - bins + 1, d_levels, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_range(nullptr, + temporary_storage_bytes, + d_input2, + columns, + rows, + row_stride_bytes, + d_histogram, + bins + 1, + d_levels, + stream, + debug_synchronous)); } ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(rows == 1) { - HIP_CHECK( - rp::histogram_range( - d_temporary_storage, temporary_storage_bytes, - d_input2, columns, - d_histogram, - bins + 1, d_levels, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_range(d_temporary_storage, + temporary_storage_bytes, + d_input2, + columns, + d_histogram, + bins + 1, + d_levels, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::histogram_range( - d_temporary_storage, temporary_storage_bytes, - d_input2, columns, rows, row_stride_bytes, - d_histogram, - bins + 1, d_levels, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::histogram_range(d_temporary_storage, + temporary_storage_bytes, + d_input2, + columns, + rows, + row_stride_bytes, + d_histogram, + bins + 1, + d_levels, + stream, + debug_synchronous)); } std::vector histogram(bins); - HIP_CHECK( - hipMemcpy( - histogram.data(), d_histogram, - bins * sizeof(counter_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + histogram.data(), d_histogram, bins * sizeof(counter_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); @@ -523,72 +489,71 @@ TYPED_TEST(RocprimDeviceHistogramRange, Range) } } -template< - class SampleType, - unsigned int Channels, - unsigned int ActiveChannels, - unsigned int Bins, - int LowerLevel, - int UpperLevel, - class LevelType = SampleType, - class CounterType = int -> +template struct params3 { - using sample_type = SampleType; - static constexpr unsigned int channels = Channels; + using sample_type = SampleType; + static constexpr unsigned int channels = Channels; static constexpr unsigned int active_channels = ActiveChannels; - static constexpr unsigned int bins = Bins; - static constexpr int lower_level = LowerLevel; - static constexpr int upper_level = UpperLevel; - using level_type = LevelType; - using counter_type = CounterType; + static constexpr unsigned int bins = Bins; + static constexpr int lower_level = LowerLevel; + static constexpr int upper_level = UpperLevel; + using level_type = LevelType; + using counter_type = CounterType; }; -template -class RocprimDeviceHistogramMultiEven : public ::testing::Test { +template +class RocprimDeviceHistogramMultiEven : public ::testing::Test +{ public: using params = Params; }; -typedef ::testing::Types< - params3, - params3, - params3, - params3, - params3, - params3, - params3, +typedef ::testing::Types, + params3, + params3, + params3, + params3, + params3, + params3, - params3, - params3, - params3 -> Params3; + params3, + params3, + params3> + Params3; TYPED_TEST_CASE(RocprimDeviceHistogramMultiEven, Params3); TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) { - using sample_type = typename TestFixture::params::sample_type; - using counter_type = typename TestFixture::params::counter_type; - using level_type = typename TestFixture::params::level_type; - constexpr unsigned int channels = TestFixture::params::channels; + using sample_type = typename TestFixture::params::sample_type; + using counter_type = typename TestFixture::params::counter_type; + using level_type = typename TestFixture::params::level_type; + constexpr unsigned int channels = TestFixture::params::channels; constexpr unsigned int active_channels = TestFixture::params::active_channels; unsigned int bins[active_channels]; unsigned int num_levels[active_channels]; - level_type lower_level[active_channels]; - level_type upper_level[active_channels]; + level_type lower_level[active_channels]; + level_type upper_level[active_channels]; for(unsigned int channel = 0; channel < active_channels; channel++) { // Use different ranges for different channels - constexpr level_type d = TestFixture::params::upper_level - TestFixture::params::lower_level; + constexpr level_type d + = TestFixture::params::upper_level - TestFixture::params::lower_level; const level_type scale = d / TestFixture::params::bins; - lower_level[channel] = TestFixture::params::lower_level + channel * d / 9; - upper_level[channel] = TestFixture::params::upper_level - channel * d / 7; - bins[channel] = (upper_level[channel] - lower_level[channel]) / scale; - upper_level[channel] = lower_level[channel] + bins[channel] * scale; - num_levels[channel] = bins[channel] + 1; + lower_level[channel] = TestFixture::params::lower_level + channel * d / 9; + upper_level[channel] = TestFixture::params::upper_level - channel * d / 7; + bins[channel] = (upper_level[channel] - lower_level[channel]) / scale; + upper_level[channel] = lower_level[channel] + bins[channel] * scale; + num_levels[channel] = bins[channel] + 1; } hipStream_t stream = 0; @@ -597,33 +562,33 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) for(auto dim : get_dims()) { - SCOPED_TRACE( - testing::Message() << "with dim = {" << - std::get<0>(dim) << ", " << std::get<1>(dim) << ", " << std::get<2>(dim) << "}" - ); + SCOPED_TRACE(testing::Message() << "with dim = {" << std::get<0>(dim) << ", " + << std::get<1>(dim) << ", " << std::get<2>(dim) << "}"); - const size_t rows = std::get<0>(dim); - const size_t columns = std::get<1>(dim); + const size_t rows = std::get<0>(dim); + const size_t columns = std::get<1>(dim); const size_t row_stride = columns * channels + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = std::max(1, rows * row_stride); // Generate data std::vector input(size); for(unsigned int channel = 0; channel < channels; channel++) { const size_t gen_columns = (row_stride + channels - 1) / channels; - const size_t gen_size = rows * gen_columns; + const size_t gen_size = rows * gen_columns; std::vector channel_input; if(channel < active_channels) { - channel_input = get_random_samples(gen_size, lower_level[channel], upper_level[channel]); + channel_input = get_random_samples( + gen_size, lower_level[channel], upper_level[channel]); } else { - channel_input = get_random_samples(gen_size, lower_level[0], upper_level[0]); + channel_input + = get_random_samples(gen_size, lower_level[0], upper_level[0]); } // Interleave values for(size_t row = 0; row < rows; row++) @@ -639,20 +604,15 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) } } - sample_type * d_input; - counter_type * d_histogram[active_channels]; + sample_type* d_input; + counter_type* d_histogram[active_channels]; HIP_CHECK(hipMalloc(&d_input, size * sizeof(sample_type))); for(unsigned int channel = 0; channel < active_channels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], bins[channel] * sizeof(counter_type))); } HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(sample_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), size * sizeof(sample_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector histogram_expected[active_channels]; @@ -665,7 +625,8 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) { for(size_t column = 0; column < columns; column++) { - const sample_type sample = input[row * row_stride + column * channels + channel]; + const sample_type sample + = input[row * row_stride + column * channels + channel]; const level_type s = static_cast(sample); if(s >= lower_level[channel] && s < upper_level[channel]) { @@ -676,78 +637,81 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) } } - rp::transform_iterator, sample_type> d_input2( - d_input, - transform_op() - ); + rp::transform_iterator, sample_type> d_input2( + d_input, transform_op()); size_t temporary_storage_bytes = 0; if(rows == 1) { - HIP_CHECK(( - rp::multi_histogram_even( - nullptr, temporary_storage_bytes, - d_input2, columns, - d_histogram, - num_levels, lower_level, upper_level, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_even(nullptr, + temporary_storage_bytes, + d_input2, + columns, + d_histogram, + num_levels, + lower_level, + upper_level, + stream, + debug_synchronous))); } else { - HIP_CHECK(( - rp::multi_histogram_even( - nullptr, temporary_storage_bytes, - d_input2, columns, rows, row_stride_bytes, - d_histogram, - num_levels, lower_level, upper_level, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_even(nullptr, + temporary_storage_bytes, + d_input2, + columns, + rows, + row_stride_bytes, + d_histogram, + num_levels, + lower_level, + upper_level, + stream, + debug_synchronous))); } ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(rows == 1) { - HIP_CHECK(( - rp::multi_histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input2, columns, - d_histogram, - num_levels, lower_level, upper_level, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input2, + columns, + d_histogram, + num_levels, + lower_level, + upper_level, + stream, + debug_synchronous))); } else { - HIP_CHECK(( - rp::multi_histogram_even( - d_temporary_storage, temporary_storage_bytes, - d_input2, columns, rows, row_stride_bytes, - d_histogram, - num_levels, lower_level, upper_level, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, + temporary_storage_bytes, + d_input2, + columns, + rows, + row_stride_bytes, + d_histogram, + num_levels, + lower_level, + upper_level, + stream, + debug_synchronous))); } std::vector histogram[active_channels]; for(unsigned int channel = 0; channel < active_channels; channel++) { histogram[channel] = std::vector(bins[channel]); - HIP_CHECK( - hipMemcpy( - histogram[channel].data(), d_histogram[channel], - bins[channel] * sizeof(counter_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(histogram[channel].data(), + d_histogram[channel], + bins[channel] * sizeof(counter_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_histogram[channel])); } @@ -766,98 +730,92 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) } } -template< - class SampleType, - unsigned int Channels, - unsigned int ActiveChannels, - unsigned int Bins, - int StartLevel = 0, - unsigned int MinBinWidth = 1, - unsigned int MaxBinWidth = 10, - class LevelType = SampleType, - class CounterType = int -> +template struct params4 { - using sample_type = SampleType; - static constexpr unsigned int channels = Channels; + using sample_type = SampleType; + static constexpr unsigned int channels = Channels; static constexpr unsigned int active_channels = ActiveChannels; - static constexpr unsigned int bins = Bins; - static constexpr int start_level = StartLevel; - static constexpr unsigned int min_bin_length = MinBinWidth; - static constexpr unsigned int max_bin_length = MaxBinWidth; - using level_type = LevelType; - using counter_type = CounterType; + static constexpr unsigned int bins = Bins; + static constexpr int start_level = StartLevel; + static constexpr unsigned int min_bin_length = MinBinWidth; + static constexpr unsigned int max_bin_length = MaxBinWidth; + using level_type = LevelType; + using counter_type = CounterType; }; -template -class RocprimDeviceHistogramMultiRange : public ::testing::Test { +template +class RocprimDeviceHistogramMultiRange : public ::testing::Test +{ public: using params = Params; }; -typedef ::testing::Types< - params4, - params4, - params4, - params4, - params4, +typedef ::testing::Types, + params4, + params4, + params4, + params4, - params4, - params4 -> Params4; + params4, + params4> + Params4; TYPED_TEST_CASE(RocprimDeviceHistogramMultiRange, Params4); TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) { - using sample_type = typename TestFixture::params::sample_type; - using counter_type = typename TestFixture::params::counter_type; - using level_type = typename TestFixture::params::level_type; - constexpr unsigned int channels = TestFixture::params::channels; + using sample_type = typename TestFixture::params::sample_type; + using counter_type = typename TestFixture::params::counter_type; + using level_type = typename TestFixture::params::level_type; + constexpr unsigned int channels = TestFixture::params::channels; constexpr unsigned int active_channels = TestFixture::params::active_channels; hipStream_t stream = 0; const bool debug_synchronous = false; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); - unsigned int bins[active_channels]; - unsigned int num_levels[active_channels]; + unsigned int bins[active_channels]; + unsigned int num_levels[active_channels]; std::uniform_int_distribution bin_length_dis[active_channels]; for(unsigned int channel = 0; channel < active_channels; channel++) { // Use different ranges for different channels - bins[channel] = TestFixture::params::bins + channel; - num_levels[channel] = bins[channel] + 1; + bins[channel] = TestFixture::params::bins + channel; + num_levels[channel] = bins[channel] + 1; bin_length_dis[channel] = std::uniform_int_distribution( - TestFixture::params::min_bin_length, - TestFixture::params::max_bin_length - ); + TestFixture::params::min_bin_length, TestFixture::params::max_bin_length); } for(auto dim : get_dims()) { - SCOPED_TRACE( - testing::Message() << "with dim = {" << - std::get<0>(dim) << ", " << std::get<1>(dim) << ", " << std::get<2>(dim) << "}" - ); + SCOPED_TRACE(testing::Message() << "with dim = {" << std::get<0>(dim) << ", " + << std::get<1>(dim) << ", " << std::get<2>(dim) << "}"); - const size_t rows = std::get<0>(dim); - const size_t columns = std::get<1>(dim); + const size_t rows = std::get<0>(dim); + const size_t columns = std::get<1>(dim); const size_t row_stride = columns * channels + std::get<2>(dim); const size_t row_stride_bytes = row_stride * sizeof(sample_type); - const size_t size = std::max(1, rows * row_stride); + const size_t size = std::max(1, rows * row_stride); // Generate data std::vector levels[active_channels]; for(unsigned int channel = 0; channel < active_channels; channel++) { level_type level = TestFixture::params::start_level; - for(unsigned int bin = 0 ; bin < bins[channel]; bin++) + for(unsigned int bin = 0; bin < bins[channel]; bin++) { levels[channel].push_back(level); level += bin_length_dis[channel](gen); @@ -869,18 +827,18 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) for(unsigned int channel = 0; channel < channels; channel++) { const size_t gen_columns = (row_stride + channels - 1) / channels; - const size_t gen_size = rows * gen_columns; + const size_t gen_size = rows * gen_columns; std::vector channel_input; if(channel < active_channels) { channel_input = get_random_samples( - gen_size, levels[channel][0], levels[channel][bins[channel]] - ); + gen_size, levels[channel][0], levels[channel][bins[channel]]); } else { - channel_input = get_random_samples(gen_size, levels[0][0], levels[0][bins[0]]); + channel_input + = get_random_samples(gen_size, levels[0][0], levels[0][bins[0]]); } // Interleave values for(size_t row = 0; row < rows; row++) @@ -896,9 +854,9 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) } } - sample_type * d_input; - level_type * d_levels[active_channels]; - counter_type * d_histogram[active_channels]; + sample_type* d_input; + level_type* d_levels[active_channels]; + counter_type* d_histogram[active_channels]; HIP_CHECK(hipMalloc(&d_input, size * sizeof(sample_type))); for(unsigned int channel = 0; channel < active_channels; channel++) { @@ -906,21 +864,13 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) HIP_CHECK(hipMalloc(&d_histogram[channel], bins[channel] * sizeof(counter_type))); } HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(sample_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), size * sizeof(sample_type), hipMemcpyHostToDevice)); for(unsigned int channel = 0; channel < active_channels; channel++) { - HIP_CHECK( - hipMemcpy( - d_levels[channel], levels[channel].data(), - num_levels[channel] * sizeof(level_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_levels[channel], + levels[channel].data(), + num_levels[channel] * sizeof(level_type), + hipMemcpyHostToDevice)); } // Calculate expected results on host @@ -933,11 +883,13 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) { for(size_t column = 0; column < columns; column++) { - const sample_type sample = input[row * row_stride + column * channels + channel]; + const sample_type sample + = input[row * row_stride + column * channels + channel]; const level_type s = static_cast(sample); if(s >= levels[channel][0] && s < levels[channel][bins[channel]]) { - const auto bin_iter = std::upper_bound(levels[channel].begin(), levels[channel].end(), s); + const auto bin_iter + = std::upper_bound(levels[channel].begin(), levels[channel].end(), s); const int bin = bin_iter - levels[channel].begin() - 1; histogram_expected[channel][bin]++; } @@ -950,70 +902,75 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) size_t temporary_storage_bytes = 0; if(rows == 1) { - HIP_CHECK(( - rp::multi_histogram_range( - nullptr, temporary_storage_bytes, - d_input, columns, - d_histogram, - num_levels, d_levels, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_range( + nullptr, + temporary_storage_bytes, + d_input, + columns, + d_histogram, + num_levels, + d_levels, + stream, + debug_synchronous))); } else { - HIP_CHECK(( - rp::multi_histogram_range( - nullptr, temporary_storage_bytes, - d_input, columns, rows, row_stride_bytes, - d_histogram, - num_levels, d_levels, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_range( + nullptr, + temporary_storage_bytes, + d_input, + columns, + rows, + row_stride_bytes, + d_histogram, + num_levels, + d_levels, + stream, + debug_synchronous))); } ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(rows == 1) { - HIP_CHECK(( - rp::multi_histogram_range( - d_temporary_storage, temporary_storage_bytes, - d_input, columns, - d_histogram, - num_levels, d_levels, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_range( + d_temporary_storage, + temporary_storage_bytes, + d_input, + columns, + d_histogram, + num_levels, + d_levels, + stream, + debug_synchronous))); } else { - HIP_CHECK(( - rp::multi_histogram_range( - d_temporary_storage, temporary_storage_bytes, - d_input, columns, rows, row_stride_bytes, - d_histogram, - num_levels, d_levels, - stream, debug_synchronous - ) - )); + HIP_CHECK((rp::multi_histogram_range( + d_temporary_storage, + temporary_storage_bytes, + d_input, + columns, + rows, + row_stride_bytes, + d_histogram, + num_levels, + d_levels, + stream, + debug_synchronous))); } std::vector histogram[active_channels]; for(unsigned int channel = 0; channel < active_channels; channel++) { histogram[channel] = std::vector(bins[channel]); - HIP_CHECK( - hipMemcpy( - histogram[channel].data(), d_histogram[channel], - bins[channel] * sizeof(counter_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(histogram[channel].data(), + d_histogram[channel], + bins[channel] * sizeof(counter_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_levels[channel])); HIP_CHECK(hipFree(d_histogram[channel])); } diff --git a/test/rocprim/test_device_merge.cpp b/test/rocprim/test_device_merge.cpp index 302d04e10..a4e36ed19 100644 --- a/test/rocprim/test_device_merge.cpp +++ b/test/rocprim/test_device_merge.cpp @@ -20,9 +20,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include #include #include -#include // Google Test #include @@ -37,39 +37,35 @@ #define HIP_CHECK(error) ASSERT_EQ(error, hipSuccess) // Params for tests -template< - class KeyType, - class ValueType, - class CompareOp = ::rocprim::less -> +template > struct DeviceMergeParams { - using key_type = KeyType; - using value_type = ValueType; + using key_type = KeyType; + using value_type = ValueType; using compare_op_type = CompareOp; }; -template +template class RocprimDeviceMergeTests : public ::testing::Test { public: - using key_type = typename Params::key_type; - using value_type = typename Params::value_type; - using compare_op_type = typename Params::compare_op_type; + using key_type = typename Params::key_type; + using value_type = typename Params::value_type; + using compare_op_type = typename Params::compare_op_type; const bool debug_synchronous = false; }; -using custom_int2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< DeviceMergeParams, - DeviceMergeParams >, + DeviceMergeParams>, DeviceMergeParams, DeviceMergeParams, - DeviceMergeParams >, - DeviceMergeParams -> RocprimDeviceMergeTestsParams; + DeviceMergeParams>, + DeviceMergeParams> + RocprimDeviceMergeTestsParams; // size1, size2 std::vector> get_sizes() @@ -95,18 +91,16 @@ TYPED_TEST_CASE(RocprimDeviceMergeTests, RocprimDeviceMergeTestsParams); TYPED_TEST(RocprimDeviceMergeTests, MergeKey) { - using key_type = typename TestFixture::key_type; - using compare_op_type = typename TestFixture::compare_op_type; + using key_type = typename TestFixture::key_type; + using compare_op_type = typename TestFixture::compare_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default for(auto sizes : get_sizes()) { - SCOPED_TRACE( - testing::Message() << "with sizes = {" << - std::get<0>(sizes) << ", " << std::get<1>(sizes) << "}" - ); + SCOPED_TRACE(testing::Message() << "with sizes = {" << std::get<0>(sizes) << ", " + << std::get<1>(sizes) << "}"); const size_t size1 = std::get<0>(sizes); const size_t size2 = std::get<1>(sizes); @@ -123,57 +117,47 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) // Calculate expected results on host std::vector expected(keys_output.size()); - std::merge( - keys_input1.begin(), - keys_input1.end(), - keys_input2.begin(), - keys_input2.end(), - expected.begin(), - compare_op - ); + std::merge(keys_input1.begin(), + keys_input1.end(), + keys_input2.begin(), + keys_input2.end(), + expected.begin(), + compare_op); test_utils::out_of_bounds_flag out_of_bounds; - key_type * d_keys_input1; - key_type * d_keys_input2; - key_type * d_keys_output; + key_type* d_keys_input1; + key_type* d_keys_input2; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input1, keys_input1.size() * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_input2, keys_input2.size() * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, keys_output.size() * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input1, keys_input1.data(), - keys_input1.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_keys_input2, keys_input2.data(), - keys_input2.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_keys_input1, + keys_input1.data(), + keys_input1.size() * sizeof(key_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_keys_input2, + keys_input2.data(), + keys_input2.size() * sizeof(key_type), + hipMemcpyHostToDevice)); test_utils::bounds_checking_iterator d_keys_checking_output( - d_keys_output, - out_of_bounds.device_pointer(), - size1 + size2 - ); + d_keys_output, out_of_bounds.device_pointer(), size1 + size2); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::merge( - d_temp_storage, temp_storage_size_bytes, - d_keys_input1, d_keys_input2, - d_keys_checking_output, - keys_input1.size(), keys_input2.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge(d_temp_storage, + temp_storage_size_bytes, + d_keys_input1, + d_keys_input2, + d_keys_checking_output, + keys_input1.size(), + keys_input2.size(), + compare_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -182,28 +166,26 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); // Run - HIP_CHECK( - rocprim::merge( - d_temp_storage, temp_storage_size_bytes, - d_keys_input1, d_keys_input2, - d_keys_checking_output, - keys_input1.size(), keys_input2.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge(d_temp_storage, + temp_storage_size_bytes, + d_keys_input1, + d_keys_input2, + d_keys_checking_output, + keys_input1.size(), + keys_input2.size(), + compare_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); ASSERT_FALSE(out_of_bounds.get()); // Copy keys_output to host - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys_output, - keys_output.size() * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + d_keys_output, + keys_output.size() * sizeof(key_type), + hipMemcpyDeviceToHost)); // Check if keys_output values are as expected for(size_t i = 0; i < keys_output.size(); i++) @@ -220,9 +202,9 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) { - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using compare_op_type = typename TestFixture::compare_op_type; + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using compare_op_type = typename TestFixture::compare_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; using key_value = std::pair; @@ -231,10 +213,8 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) for(auto sizes : get_sizes()) { - SCOPED_TRACE( - testing::Message() << "with sizes = {" << - std::get<0>(sizes) << ", " << std::get<1>(sizes) << "}" - ); + SCOPED_TRACE(testing::Message() << "with sizes = {" << std::get<0>(sizes) << ", " + << std::get<1>(sizes) << "}"); const size_t size1 = std::get<0>(sizes); const size_t size2 = std::get<1>(sizes); @@ -251,7 +231,7 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) std::vector values_input2(size2); std::iota(values_input1.begin(), values_input1.end(), 0); std::iota(values_input2.begin(), values_input2.end(), size1); - std::vector keys_output(size1 + size2, 0); + std::vector keys_output(size1 + size2, 0); std::vector values_output(size1 + size2, 0); // Calculate expected results on host @@ -268,84 +248,68 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) } std::vector expected(size1 + size2); - std::merge( - vector1.begin(), - vector1.end(), - vector2.begin(), - vector2.end(), - expected.begin(), - [compare_op](const key_value& a, const key_value& b) { return compare_op(a.first, b.first); } - ); + std::merge(vector1.begin(), + vector1.end(), + vector2.begin(), + vector2.end(), + expected.begin(), + [compare_op](const key_value& a, const key_value& b) { + return compare_op(a.first, b.first); + }); test_utils::out_of_bounds_flag out_of_bounds; - key_type * d_keys_input1; - key_type * d_keys_input2; - key_type * d_keys_output; - value_type * d_values_input1; - value_type * d_values_input2; - value_type * d_values_output; + key_type* d_keys_input1; + key_type* d_keys_input2; + key_type* d_keys_output; + value_type* d_values_input1; + value_type* d_values_input2; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_keys_input1, keys_input1.size() * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_input2, keys_input2.size() * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, keys_output.size() * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_values_input1, values_input1.size() * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_input2, values_input2.size() * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, values_output.size() * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input1, keys_input1.data(), - keys_input1.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_keys_input2, keys_input2.data(), - keys_input2.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_values_input1, values_input1.data(), - values_input1.size() * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_values_input2, values_input2.data(), - values_input2.size() * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_keys_input1, + keys_input1.data(), + keys_input1.size() * sizeof(key_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_keys_input2, + keys_input2.data(), + keys_input2.size() * sizeof(key_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values_input1, + values_input1.data(), + values_input1.size() * sizeof(value_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values_input2, + values_input2.data(), + values_input2.size() * sizeof(value_type), + hipMemcpyHostToDevice)); test_utils::bounds_checking_iterator d_keys_checking_output( - d_keys_output, - out_of_bounds.device_pointer(), - size1 + size2 - ); + d_keys_output, out_of_bounds.device_pointer(), size1 + size2); test_utils::bounds_checking_iterator d_values_checking_output( - d_values_output, - out_of_bounds.device_pointer(), - size1 + size2 - ); + d_values_output, out_of_bounds.device_pointer(), size1 + size2); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::merge( - d_temp_storage, temp_storage_size_bytes, - d_keys_input1, d_keys_input2, - d_keys_checking_output, - d_values_input1, d_values_input2, - d_values_checking_output, - keys_input1.size(), keys_input2.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge(d_temp_storage, + temp_storage_size_bytes, + d_keys_input1, + d_keys_input2, + d_keys_checking_output, + d_values_input1, + d_values_input2, + d_values_checking_output, + keys_input1.size(), + keys_input2.size(), + compare_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -354,36 +318,32 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); // Run - HIP_CHECK( - rocprim::merge( - d_temp_storage, temp_storage_size_bytes, - d_keys_input1, d_keys_input2, - d_keys_checking_output, - d_values_input1, d_values_input2, - d_values_checking_output, - keys_input1.size(), keys_input2.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge(d_temp_storage, + temp_storage_size_bytes, + d_keys_input1, + d_keys_input2, + d_keys_checking_output, + d_values_input1, + d_values_input2, + d_values_checking_output, + keys_input1.size(), + keys_input2.size(), + compare_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); ASSERT_FALSE(out_of_bounds.get()); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys_output, - keys_output.size() * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values_output, - values_output.size() * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + d_keys_output, + keys_output.size() * sizeof(key_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values_output, + values_output.size() * sizeof(value_type), + hipMemcpyDeviceToHost)); // Check if keys_output values are as expected for(size_t i = 0; i < keys_output.size(); i++) diff --git a/test/rocprim/test_device_merge_sort.cpp b/test/rocprim/test_device_merge_sort.cpp index f7a04e085..80e9e7d83 100644 --- a/test/rocprim/test_device_merge_sort.cpp +++ b/test/rocprim/test_device_merge_sort.cpp @@ -20,9 +20,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include #include #include -#include // Google Test #include @@ -39,15 +39,13 @@ namespace rp = rocprim; // Params for tests -template< - class KeyType, - class ValueType = KeyType, - class CompareFunction = ::rocprim::less -> +template > struct DeviceSortParams { - using key_type = KeyType; - using value_type = ValueType; + using key_type = KeyType; + using value_type = ValueType; using compare_function = CompareFunction; }; @@ -55,13 +53,13 @@ struct DeviceSortParams // Test for reduce ops taking single input value // --------------------------------------------------------- -template +template class RocprimDeviceSortTests : public ::testing::Test { public: - using key_type = typename Params::key_type; - using value_type = typename Params::value_type; - using compare_function = typename Params::compare_function; + using key_type = typename Params::key_type; + using value_type = typename Params::value_type; + using compare_function = typename Params::compare_function; const bool debug_synchronous = false; }; @@ -75,17 +73,13 @@ typedef ::testing::Types< DeviceSortParams>, DeviceSortParams>, DeviceSortParams>, - DeviceSortParams, test_utils::custom_test_type> -> RocprimDeviceSortTestsParams; + DeviceSortParams, test_utils::custom_test_type>> + RocprimDeviceSortTestsParams; std::vector get_sizes() { std::vector sizes = { - 1, 10, 53, 211, - 128, 256, 512, - 1024, 2048, 5000, - 34567, (1 << 17) - 1220, (1 << 20) - 123 - }; + 1, 10, 53, 211, 128, 256, 512, 1024, 2048, 5000, 34567, (1 << 17) - 1220, (1 << 20) - 123}; const std::vector random_sizes = test_utils::get_random_data(5, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); @@ -96,8 +90,8 @@ TYPED_TEST_CASE(RocprimDeviceSortTests, RocprimDeviceSortTestsParams); TYPED_TEST(RocprimDeviceSortTests, SortKey) { - using key_type = typename TestFixture::key_type; - using compare_function = typename TestFixture::compare_function; + using key_type = typename TestFixture::key_type; + using compare_function = typename TestFixture::compare_function; const bool debug_synchronous = TestFixture::debug_synchronous; bool in_place = false; @@ -114,8 +108,8 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) std::vector input = test_utils::get_random_data(size, 0, size); std::vector output(size); - key_type * d_input; - key_type * d_output; + key_type* d_input; + key_type* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(key_type))); if(in_place) { @@ -125,13 +119,8 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) { HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(key_type))); } - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_input, input.data(), input.size() * sizeof(key_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // compare function @@ -139,23 +128,20 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) // Calculate expected results on host std::vector expected(input); - std::stable_sort( - expected.begin(), - expected.end(), - compare_op - ); + std::stable_sort(expected.begin(), expected.end(), compare_op); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::merge_sort( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, input.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge_sort(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + input.size(), + compare_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -165,24 +151,20 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::merge_sort( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, input.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge_sort(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + input.size(), + compare_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, output.size() * sizeof(key_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected @@ -202,9 +184,9 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) { - using key_type = typename TestFixture::key_type; - using value_type = typename TestFixture::value_type; - using compare_function = typename TestFixture::compare_function; + using key_type = typename TestFixture::key_type; + using value_type = typename TestFixture::value_type; + using compare_function = typename TestFixture::compare_function; const bool debug_synchronous = TestFixture::debug_synchronous; bool in_place = false; @@ -223,11 +205,11 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - std::vector keys_output(size); + std::vector keys_output(size); std::vector values_output(size); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, keys_input.size() * sizeof(key_type))); if(in_place) { @@ -237,17 +219,14 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) { HIP_CHECK(hipMalloc(&d_keys_output, keys_output.size() * sizeof(key_type))); } - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - keys_input.size() * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_keys_input, + keys_input.data(), + keys_input.size() * sizeof(key_type), + hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - value_type * d_values_input; - value_type * d_values_output; + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, values_input.size() * sizeof(value_type))); if(in_place) { @@ -257,13 +236,10 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) { HIP_CHECK(hipMalloc(&d_values_output, values_output.size() * sizeof(value_type))); } - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - values_input.size() * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + values_input.size() * sizeof(value_type), + hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // compare function @@ -277,23 +253,24 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) expected[i] = key_value(keys_input[i], values_input[i]); } std::stable_sort( - expected.begin(), - expected.end(), - [compare_op](const key_value& a, const key_value& b) { return compare_op(a.first, b.first); } - ); + expected.begin(), expected.end(), [compare_op](const key_value& a, const key_value& b) { + return compare_op(a.first, b.first); + }); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::merge_sort( - d_temp_storage, temp_storage_size_bytes, - d_keys_input, d_keys_output, - d_values_input, d_values_output, keys_input.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge_sort(d_temp_storage, + temp_storage_size_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + keys_input.size(), + compare_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -303,32 +280,28 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::merge_sort( - d_temp_storage, temp_storage_size_bytes, - d_keys_input, d_keys_output, - d_values_input, d_values_output, keys_input.size(), - compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::merge_sort(d_temp_storage, + temp_storage_size_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + keys_input.size(), + compare_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys_output, - keys_output.size() * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values_output, - values_output.size() * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(keys_output.data(), + d_keys_output, + keys_output.size() * sizeof(key_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values_output, + values_output.size() * sizeof(value_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected diff --git a/test/rocprim/test_device_partition.cpp b/test/rocprim/test_device_partition.cpp index 76f63ba63..cfe2a5cff 100644 --- a/test/rocprim/test_device_partition.cpp +++ b/test/rocprim/test_device_partition.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include #include +#include +#include // Google Test #include @@ -34,50 +34,42 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) // Params for tests -template< - class InputType, - class OutputType = InputType, - class FlagType = unsigned int, - bool UseIdentityIterator = false -> +template struct DevicePartitionParams { - using input_type = InputType; - using output_type = OutputType; - using flag_type = FlagType; + using input_type = InputType; + using output_type = OutputType; + using flag_type = FlagType; static constexpr bool use_identity_iterator = UseIdentityIterator; }; -template +template class RocprimDevicePartitionTests : public ::testing::Test { public: - using input_type = typename Params::input_type; - using output_type = typename Params::output_type; - using flag_type = typename Params::flag_type; - const bool debug_synchronous = false; + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; + using flag_type = typename Params::flag_type; + const bool debug_synchronous = false; static constexpr bool use_identity_iterator = Params::use_identity_iterator; }; -typedef ::testing::Types< - DevicePartitionParams, - DevicePartitionParams, - DevicePartitionParams, - DevicePartitionParams> -> RocprimDevicePartitionTestsParams; +typedef ::testing::Types, + DevicePartitionParams, + DevicePartitionParams, + DevicePartitionParams>> + RocprimDevicePartitionTestsParams; std::vector get_sizes() { - std::vector sizes = { - 2, 32, 64, 256, - 1024, 2048, - 3072, 4096, - 27845, (1 << 18) + 1111, - 1024 * 1024 * 32 - }; + std::vector sizes + = {2, 32, 64, 256, 1024, 2048, 3072, 4096, 27845, (1 << 18) + 1111, 1024 * 1024 * 32}; const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); @@ -88,11 +80,11 @@ TYPED_TEST_CASE(RocprimDevicePartitionTests, RocprimDevicePartitionTestsParams); TYPED_TEST(RocprimDevicePartitionTests, Flagged) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; - using F = typename TestFixture::flag_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + using F = typename TestFixture::flag_type; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; - const bool debug_synchronous = TestFixture::debug_synchronous; + const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream @@ -105,35 +97,25 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) std::vector input = test_utils::get_random_data(size, 1, 100); std::vector flags = test_utils::get_random_data01(size, 0.25); - T * d_input; - F * d_flags; - U * d_output; - unsigned int * d_selected_count_output; + T* d_input; + F* d_flags; + U* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(F))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(U))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(F), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected_selected and expected_rejected results on host std::vector expected_selected; std::vector expected_rejected; - expected_selected.reserve(input.size()/2); - expected_rejected.reserve(input.size()/2); + expected_selected.reserve(input.size() / 2); + expected_rejected.reserve(input.size() / 2); for(size_t i = 0; i < input.size(); i++) { if(flags[i] != 0) @@ -150,66 +132,52 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::partition( - nullptr, - temp_storage_size_bytes, - d_input, - d_flags, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::partition( + nullptr, + temp_storage_size_bytes, + d_input, + d_flags, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::partition( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::partition( + d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if number of selected value is as expected_selected unsigned int selected_count_output = 0; - HIP_CHECK( - hipMemcpy( - &selected_count_output, d_selected_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_EQ(selected_count_output, expected_selected.size()); // Check if output values are as expected_selected std::vector output(input.size()); HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < expected_selected.size(); i++) @@ -232,86 +200,69 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) TYPED_TEST(RocprimDevicePartitionTests, PredicateEmptyInput) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream - auto select_op = [] __host__ __device__ (const T& value) -> bool - { - if(value == T(50)) return true; + auto select_op = [] __host__ __device__(const T& value) -> bool { + if(value == T(50)) + return true; return false; }; - U * d_output; - unsigned int * d_selected_count_output; + U* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_output, sizeof(U))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); unsigned int selected_count_output = 123; - HIP_CHECK( - hipMemcpy( - d_selected_count_output, &selected_count_output, - sizeof(unsigned int), - hipMemcpyHostToDevice - ) - ); - - test_utils::out_of_bounds_flag out_of_bounds; + HIP_CHECK(hipMemcpy(d_selected_count_output, + &selected_count_output, + sizeof(unsigned int), + hipMemcpyHostToDevice)); + + test_utils::out_of_bounds_flag out_of_bounds; test_utils::bounds_checking_iterator d_checking_output( - d_output, - out_of_bounds.device_pointer(), - 0 - ); + d_output, out_of_bounds.device_pointer(), 0); // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::partition( - nullptr, - temp_storage_size_bytes, - rocprim::make_constant_iterator(T(345)), - d_checking_output, - d_selected_count_output, - 0, - select_op, - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::partition(nullptr, + temp_storage_size_bytes, + rocprim::make_constant_iterator(T(345)), + d_checking_output, + d_selected_count_output, + 0, + select_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); // Run - HIP_CHECK( - rocprim::partition( - d_temp_storage, - temp_storage_size_bytes, - rocprim::make_constant_iterator(T(345)), - d_checking_output, - d_selected_count_output, - 0, - select_op, - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::partition(d_temp_storage, + temp_storage_size_bytes, + rocprim::make_constant_iterator(T(345)), + d_checking_output, + d_selected_count_output, + 0, + select_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_FALSE(out_of_bounds.get()); // Check if number of selected value is 0 - HIP_CHECK( - hipMemcpy( - &selected_count_output, d_selected_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); ASSERT_EQ(selected_count_output, 0); hipFree(d_output); @@ -321,16 +272,16 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateEmptyInput) TYPED_TEST(RocprimDevicePartitionTests, Predicate) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; - const bool debug_synchronous = TestFixture::debug_synchronous; + const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream - auto select_op = [] __host__ __device__ (const T& value) -> bool - { - if(value == T(50)) return true; + auto select_op = [] __host__ __device__(const T& value) -> bool { + if(value == T(50)) + return true; return false; }; @@ -342,26 +293,21 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) // Generate data std::vector input = test_utils::get_random_data(size, 1, 100); - T * d_input; - U * d_output; - unsigned int * d_selected_count_output; + T* d_input; + U* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(U))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected_selected and expected_rejected results on host std::vector expected_selected; std::vector expected_rejected; - expected_selected.reserve(input.size()/2); - expected_rejected.reserve(input.size()/2); + expected_selected.reserve(input.size() / 2); + expected_rejected.reserve(input.size() / 2); for(size_t i = 0; i < input.size(); i++) { if(select_op(input[i])) @@ -378,66 +324,52 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::partition( - nullptr, - temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - select_op, - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::partition( + nullptr, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::partition( - d_temp_storage, - temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - select_op, - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::partition( + d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if number of selected value is as expected_selected unsigned int selected_count_output = 0; - HIP_CHECK( - hipMemcpy( - &selected_count_output, d_selected_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_EQ(selected_count_output, expected_selected.size()); // Check if output values are as expected_selected std::vector output(input.size()); HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < expected_selected.size(); i++) diff --git a/test/rocprim/test_device_radix_sort.cpp b/test/rocprim/test_device_radix_sort.cpp index cfb7655ea..b9977e0db 100644 --- a/test/rocprim/test_device_radix_sort.cpp +++ b/test/rocprim/test_device_radix_sort.cpp @@ -24,8 +24,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -41,71 +41,70 @@ namespace rp = rocprim; #define HIP_CHECK(error) ASSERT_EQ(error, hipSuccess) -template< - class Key, - class Value, - bool Descending = false, - unsigned int StartBit = 0, - unsigned int EndBit = sizeof(Key) * 8, - bool CheckHugeSizes = false -> +template struct params { - using key_type = Key; - using value_type = Value; - static constexpr bool descending = Descending; - static constexpr unsigned int start_bit = StartBit; - static constexpr unsigned int end_bit = EndBit; - static constexpr bool check_huge_sizes = CheckHugeSizes; + using key_type = Key; + using value_type = Value; + static constexpr bool descending = Descending; + static constexpr unsigned int start_bit = StartBit; + static constexpr unsigned int end_bit = EndBit; + static constexpr bool check_huge_sizes = CheckHugeSizes; }; -template -class RocprimDeviceRadixSort : public ::testing::Test { +template +class RocprimDeviceRadixSort : public ::testing::Test +{ public: using params = Params; }; -typedef ::testing::Types< - params, - params, - params, - params, - params, - params, - params, - params, - params>, - - // start_bit and end_bit - params, - params, - params, - params, - params, - params, - params, false, 8, 11>, - - // huge sizes to check correctness of more than 1 block per batch - params -> Params; +typedef ::testing::Types, + params, + params, + params, + params, + params, + params, + params, + params>, + + // start_bit and end_bit + params, + params, + params, + params, + params, + params, + params, false, 8, 11>, + + // huge sizes to check correctness of more than 1 block per batch + params> + Params; TYPED_TEST_CASE(RocprimDeviceRadixSort, Params); -template +template struct key_comparator { - static_assert(rp::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); + static_assert(rp::is_unsigned::value, + "Test supports start and end bits only for unsigned integers"); bool operator()(const Key& lhs, const Key& rhs) { auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = (static_cast(rhs) >> StartBit) & mask; + auto l = (static_cast(lhs) >> StartBit) & mask; + auto r = (static_cast(rhs) >> StartBit) & mask; return Descending ? (r < l) : (l < r); } }; -template +template struct key_comparator { bool operator()(const Key& lhs, const Key& rhs) @@ -114,7 +113,7 @@ struct key_comparator } }; -template +template struct key_comparator { bool operator()(const rp::half& lhs, const rp::half& rhs) @@ -124,7 +123,7 @@ struct key_comparator } }; -template +template struct key_value_comparator { bool operator()(const std::pair& lhs, const std::pair& rhs) @@ -135,7 +134,8 @@ struct key_value_comparator std::vector get_sizes() { - std::vector sizes = { 1, 10, 53, 211, 1024, 2345, 4096, 34567, (1 << 16) - 1220, (1 << 23) - 76543 }; + std::vector sizes + = {1, 10, 53, 211, 1024, 2345, 4096, 34567, (1 << 16) - 1220, (1 << 23) - 76543}; const std::vector random_sizes = test_utils::get_random_data(10, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); return sizes; @@ -143,11 +143,11 @@ std::vector get_sizes() TYPED_TEST(RocprimDeviceRadixSort, SortKeys) { - using key_type = typename TestFixture::params::key_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; + using key_type = typename TestFixture::params::key_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; hipStream_t stream = 0; @@ -157,7 +157,8 @@ TYPED_TEST(RocprimDeviceRadixSort, SortKeys) for(size_t size : get_sizes()) { - if(size > (1 << 20) && !check_huge_sizes) continue; + if(size > (1 << 20) && !check_huge_sizes) + continue; SCOPED_TRACE(testing::Message() << "with size = " << size); @@ -167,19 +168,17 @@ TYPED_TEST(RocprimDeviceRadixSort, SortKeys) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); if(in_place) { @@ -189,67 +188,61 @@ TYPED_TEST(RocprimDeviceRadixSort, SortKeys) { HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); } - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector expected(keys_input); - std::stable_sort(expected.begin(), expected.end(), key_comparator()); + std::stable_sort(expected.begin(), + expected.end(), + key_comparator()); // Use custom config - using config = rp::radix_sort_config<8, 5, rp::kernel_config<256, 3>, rp::kernel_config<256, 8>>; + using config + = rp::radix_sort_config<8, 5, rp::kernel_config<256, 3>, rp::kernel_config<256, 8>>; size_t temporary_storage_bytes; - HIP_CHECK( - rp::radix_sort_keys( - nullptr, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::radix_sort_keys(nullptr, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + start_bit, + end_bit)); ASSERT_GT(temporary_storage_bytes, 0); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(descending) { - HIP_CHECK( - rp::radix_sort_keys_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_keys_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } - std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys_output, - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys_output, size * sizeof(key_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); @@ -264,12 +257,12 @@ TYPED_TEST(RocprimDeviceRadixSort, SortKeys) TYPED_TEST(RocprimDeviceRadixSort, SortPairs) { - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; hipStream_t stream = 0; @@ -279,7 +272,8 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairs) for(size_t size : get_sizes()) { - if(size > (1 << 20) && !check_huge_sizes) continue; + if(size > (1 << 20) && !check_huge_sizes) + continue; SCOPED_TRACE(testing::Message() << "with size = " << size); @@ -289,22 +283,20 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairs) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); if(in_place) { @@ -314,16 +306,11 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairs) { HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); } - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); if(in_place) { @@ -333,13 +320,8 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairs) { HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); } - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); using key_value = std::pair; @@ -350,26 +332,28 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairs) expected[i] = key_value(keys_input[i], values_input[i]); } std::stable_sort( - expected.begin(), expected.end(), - key_value_comparator() - ); - std::vector keys_expected(size); + expected.begin(), + expected.end(), + key_value_comparator()); + std::vector keys_expected(size); std::vector values_expected(size); for(size_t i = 0; i < size; i++) { - keys_expected[i] = expected[i].first; + keys_expected[i] = expected[i].first; values_expected[i] = expected[i].second; } - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK( - rp::radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + start_bit, + end_bit)); ASSERT_GT(temporary_storage_bytes, 0); @@ -377,45 +361,42 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairs) if(descending) { - HIP_CHECK( - rp::radix_sort_pairs_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_pairs_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } - std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys_output, - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys_output, size * sizeof(key_type), hipMemcpyDeviceToHost)); std::vector values_output(size); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values_output, - size * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values_output, + size * sizeof(value_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); @@ -433,11 +414,11 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairs) TYPED_TEST(RocprimDeviceRadixSort, SortKeysDoubleBuffer) { - using key_type = typename TestFixture::params::key_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; + using key_type = typename TestFixture::params::key_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; hipStream_t stream = 0; @@ -446,7 +427,8 @@ TYPED_TEST(RocprimDeviceRadixSort, SortKeysDoubleBuffer) const std::vector sizes = get_sizes(); for(size_t size : sizes) { - if(size > (1 << 20) && !check_huge_sizes) continue; + if(size > (1 << 20) && !check_huge_sizes) + continue; SCOPED_TRACE(testing::Message() << "with size = " << size); @@ -454,82 +436,67 @@ TYPED_TEST(RocprimDeviceRadixSort, SortKeysDoubleBuffer) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); // Calculate expected results on host std::vector expected(keys_input); - std::stable_sort(expected.begin(), expected.end(), key_comparator()); + std::stable_sort(expected.begin(), + expected.end(), + key_comparator()); rp::double_buffer d_keys(d_keys_input, d_keys_output); size_t temporary_storage_bytes; - HIP_CHECK( - rp::radix_sort_keys( - nullptr, temporary_storage_bytes, - d_keys, size, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::radix_sort_keys( + nullptr, temporary_storage_bytes, d_keys, size, start_bit, end_bit)); ASSERT_GT(temporary_storage_bytes, 0); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(descending) { - HIP_CHECK( - rp::radix_sort_keys_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_keys_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } HIP_CHECK(hipFree(d_temporary_storage)); std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys.current(), - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys.current(), size * sizeof(key_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); @@ -540,12 +507,12 @@ TYPED_TEST(RocprimDeviceRadixSort, SortKeysDoubleBuffer) TYPED_TEST(RocprimDeviceRadixSort, SortPairsDoubleBuffer) { - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; - constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; + constexpr bool check_huge_sizes = TestFixture::params::check_huge_sizes; hipStream_t stream = 0; @@ -554,7 +521,8 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairsDoubleBuffer) const std::vector sizes = get_sizes(); for(size_t size : sizes) { - if(size > (1 << 20) && !check_huge_sizes) continue; + if(size > (1 << 20) && !check_huge_sizes) + continue; SCOPED_TRACE(testing::Message() << "with size = " << size); @@ -562,43 +530,31 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairsDoubleBuffer) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); using key_value = std::pair; @@ -609,29 +565,29 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairsDoubleBuffer) expected[i] = key_value(keys_input[i], values_input[i]); } std::stable_sort( - expected.begin(), expected.end(), - key_value_comparator() - ); - std::vector keys_expected(size); + expected.begin(), + expected.end(), + key_value_comparator()); + std::vector keys_expected(size); std::vector values_expected(size); for(size_t i = 0; i < size; i++) { - keys_expected[i] = expected[i].first; + keys_expected[i] = expected[i].first; values_expected[i] = expected[i].second; } - rp::double_buffer d_keys(d_keys_input, d_keys_output); + rp::double_buffer d_keys(d_keys_input, d_keys_output); rp::double_buffer d_values(d_values_input, d_values_output); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK( - rp::radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys, d_values, size, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys, + d_values, + size, + start_bit, + end_bit)); ASSERT_GT(temporary_storage_bytes, 0); @@ -639,46 +595,40 @@ TYPED_TEST(RocprimDeviceRadixSort, SortPairsDoubleBuffer) if(descending) { - HIP_CHECK( - rp::radix_sort_pairs_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys, d_values, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_pairs_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys, + d_values, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys, d_values, size, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys, + d_values, + size, + start_bit, + end_bit, + stream, + debug_synchronous)); } HIP_CHECK(hipFree(d_temporary_storage)); std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys.current(), - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys.current(), size * sizeof(key_type), hipMemcpyDeviceToHost)); std::vector values_output(size); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values.current(), - size * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values.current(), + size * sizeof(value_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); diff --git a/test/rocprim/test_device_reduce.cpp b/test/rocprim/test_device_reduce.cpp index 68e1ced3e..0a6e04d74 100644 --- a/test/rocprim/test_device_reduce.cpp +++ b/test/rocprim/test_device_reduce.cpp @@ -20,9 +20,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include #include #include -#include // Google Test #include @@ -34,36 +34,31 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; // Params for tests -template< - class InputType, - class OutputType = InputType, - bool UseIdentityIterator = false -> +template struct DeviceReduceParams { - using input_type = InputType; + using input_type = InputType; using output_type = OutputType; // Tests output iterator with void value_type (OutputIterator concept) - static constexpr bool use_identity_iterator = UseIdentityIterator; + static constexpr bool use_identity_iterator = UseIdentityIterator; }; // --------------------------------------------------------- // Test for reduce ops taking single input value // --------------------------------------------------------- -template +template class RocprimDeviceReduceTests : public ::testing::Test { public: - using input_type = typename Params::input_type; - using output_type = typename Params::output_type; - const bool debug_synchronous = false; + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; + const bool debug_synchronous = false; static constexpr bool use_identity_iterator = Params::use_identity_iterator; }; @@ -73,16 +68,12 @@ typedef ::testing::Types< DeviceReduceParams, DeviceReduceParams, DeviceReduceParams, test_utils::custom_test_type>, - DeviceReduceParams, test_utils::custom_test_type> -> RocprimDeviceReduceTestsParams; + DeviceReduceParams, test_utils::custom_test_type>> + RocprimDeviceReduceTestsParams; std::vector get_sizes() { - std::vector sizes = { - 1, 10, 53, 211, - 1024, 2048, 5096, - 34567, (1 << 17) - 1220 - }; + std::vector sizes = {1, 10, 53, 211, 1024, 2048, 5096, 34567, (1 << 17) - 1220}; const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); @@ -93,52 +84,46 @@ TYPED_TEST_CASE(RocprimDeviceReduceTests, RocprimDeviceReduceTestsParams); TYPED_TEST(RocprimDeviceReduceTests, ReduceEmptyInput) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream - U * d_output; + U* d_output; HIP_CHECK(hipMalloc(&d_output, sizeof(U))); const U initial_value = U(1234); size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::reduce( - nullptr, temp_storage_size_bytes, - rocprim::make_constant_iterator(T(345)), - d_output, - initial_value, - 0, rocprim::minimum(), stream, debug_synchronous - ) - ); - - void * d_temp_storage = nullptr; + HIP_CHECK(rocprim::reduce(nullptr, + temp_storage_size_bytes, + rocprim::make_constant_iterator(T(345)), + d_output, + initial_value, + 0, + rocprim::minimum(), + stream, + debug_synchronous)); + + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); // Run - HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - rocprim::make_constant_iterator(T(345)), - d_output, - initial_value, - 0, rocprim::minimum(), stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + rocprim::make_constant_iterator(T(345)), + d_output, + initial_value, + 0, + rocprim::minimum(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); U output; - HIP_CHECK( - hipMemcpy( - &output, d_output, - sizeof(U), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&output, d_output, sizeof(U), hipMemcpyDeviceToHost)); ASSERT_EQ(output, initial_value); hipFree(d_output); @@ -147,9 +132,9 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceEmptyInput) TYPED_TEST(RocprimDeviceReduceTests, Reduce) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; - const bool debug_synchronous = TestFixture::debug_synchronous; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + const bool debug_synchronous = TestFixture::debug_synchronous; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; const std::vector sizes = get_sizes(); @@ -166,17 +151,12 @@ TYPED_TEST(RocprimDeviceReduceTests, Reduce) // reduce function ::rocprim::plus plus_op; - T * d_input; - U * d_output; + T* d_input; + U* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host @@ -188,16 +168,17 @@ TYPED_TEST(RocprimDeviceReduceTests, Reduce) // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - input.size(), plus_op, stream, debug_synchronous - ) - ); + rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + input.size(), + plus_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -208,24 +189,20 @@ TYPED_TEST(RocprimDeviceReduceTests, Reduce) // Run HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - input.size(), plus_op, stream, debug_synchronous - ) - ); + rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + input.size(), + plus_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected @@ -239,9 +216,9 @@ TYPED_TEST(RocprimDeviceReduceTests, Reduce) TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; - const bool debug_synchronous = TestFixture::debug_synchronous; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + const bool debug_synchronous = TestFixture::debug_synchronous; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; const std::vector sizes = get_sizes(); @@ -255,17 +232,12 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) std::vector input = test_utils::get_random_data(size, 1, 100); std::vector output(1, 0); - T * d_input; - U * d_output; + T* d_input; + U* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // reduce function @@ -280,16 +252,18 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - test_utils::numeric_limits::max(), input.size(), min_op, stream, debug_synchronous - ) - ); + rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + test_utils::numeric_limits::max(), + input.size(), + min_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -300,24 +274,21 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) // Run HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - test_utils::numeric_limits::max(), input.size(), min_op, stream, debug_synchronous - ) - ); + rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + test_utils::numeric_limits::max(), + input.size(), + min_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected @@ -329,16 +300,12 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) } } -template< - class Key, - class Value -> +template struct arg_min { - ROCPRIM_HOST_DEVICE inline - constexpr rocprim::key_value_pair - operator()(const rocprim::key_value_pair& a, - const rocprim::key_value_pair& b) const + ROCPRIM_HOST_DEVICE inline constexpr rocprim::key_value_pair + operator()(const rocprim::key_value_pair& a, + const rocprim::key_value_pair& b) const { return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; } @@ -346,9 +313,9 @@ struct arg_min TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) { - using T = typename TestFixture::input_type; - using key_value = rocprim::key_value_pair; - const bool debug_synchronous = TestFixture::debug_synchronous; + using T = typename TestFixture::input_type; + using key_value = rocprim::key_value_pair; + const bool debug_synchronous = TestFixture::debug_synchronous; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; const std::vector sizes = get_sizes(); @@ -360,24 +327,19 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) // Generate data std::vector input(size); - for (size_t i = 0; i < size; i++) + for(size_t i = 0; i < size; i++) { - input[i].key = i; + input[i].key = i; input[i].value = test_utils::get_random_value(1, 100); } std::vector output(1); - key_value * d_input; - key_value * d_output; + key_value* d_input; + key_value* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(key_value))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(key_value))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(key_value), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_input, input.data(), input.size() * sizeof(key_value), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); arg_min reduce_op; @@ -392,16 +354,18 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - max, input.size(), reduce_op, stream, debug_synchronous - ) - ); + rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + max, + input.size(), + reduce_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -412,24 +376,21 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) // Run HIP_CHECK( - rocprim::reduce( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - max, input.size(), reduce_op, stream, debug_synchronous - ) - ); + rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + max, + input.size(), + reduce_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(key_value), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, output.size() * sizeof(key_value), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected diff --git a/test/rocprim/test_device_reduce_by_key.cpp b/test/rocprim/test_device_reduce_by_key.cpp index f3e697b99..cc237cab5 100644 --- a/test/rocprim/test_device_reduce_by_key.cpp +++ b/test/rocprim/test_device_reduce_by_key.cpp @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -41,46 +41,44 @@ namespace rp = rocprim; -template< - class Key, - class Value, - class ReduceOp, - unsigned int MinSegmentLength, - unsigned int MaxSegmentLength, - class Aggregate = Value, - class KeyCompareFunction = ::rocprim::equal_to, - // Tests output iterator with void value_type (OutputIterator concept) - bool UseIdentityIterator = false -> +template , + // Tests output iterator with void value_type (OutputIterator concept) + bool UseIdentityIterator = false> struct params { - using key_type = Key; - using value_type = Value; - using reduce_op_type = ReduceOp; + using key_type = Key; + using value_type = Value; + using reduce_op_type = ReduceOp; static constexpr unsigned int min_segment_length = MinSegmentLength; static constexpr unsigned int max_segment_length = MaxSegmentLength; - using aggregate_type = Aggregate; - using key_compare_op = KeyCompareFunction; - static constexpr bool use_identity_iterator = UseIdentityIterator; + using aggregate_type = Aggregate; + using key_compare_op = KeyCompareFunction; + static constexpr bool use_identity_iterator = UseIdentityIterator; }; -template -class RocprimDeviceReduceByKey : public ::testing::Test { +template +class RocprimDeviceReduceByKey : public ::testing::Test +{ public: using params = Params; }; struct custom_reduce_op1 { - template - ROCPRIM_HOST_DEVICE - T operator()(T a, T b) + template + ROCPRIM_HOST_DEVICE T operator()(T a, T b) { return a + b; } }; -template +template struct custom_key_compare_op1 { ROCPRIM_HOST_DEVICE @@ -90,7 +88,7 @@ struct custom_key_compare_op1 } }; -using custom_int2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< @@ -101,7 +99,13 @@ typedef ::testing::Types< params, 1, 30>, params, params, 20, 100>, - params, 100, 400, long long, custom_key_compare_op1>, + params, + 100, + 400, + long long, + custom_key_compare_op1>, params, 200, 600>, params, 100, 2000, double, custom_key_compare_op1>, params, @@ -109,20 +113,28 @@ typedef ::testing::Types< params, 2048, 2048>, params, 1000, 10000, long long>, params, 1000, 50000>, - params, 100000, 100000> -> Params; + params, 100000, 100000>> + Params; TYPED_TEST_CASE(RocprimDeviceReduceByKey, Params); std::vector get_sizes() { - std::vector sizes = { - 1024, 2048, 4096, 1792, - 1, 10, 53, 211, 500, - 2345, 11001, 34567, - 100000, - (1 << 16) - 1220, (1 << 23) - 76543 - }; + std::vector sizes = {1024, + 2048, + 4096, + 1792, + 1, + 10, + 53, + 211, + 500, + 2345, + 11001, + 34567, + 100000, + (1 << 16) - 1220, + (1 << 23) - 76543}; const std::vector random_sizes = test_utils::get_random_data(10, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); return sizes; @@ -130,25 +142,24 @@ std::vector get_sizes() TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) { - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - using aggregate_type = typename TestFixture::params::aggregate_type; - using reduce_op_type = typename TestFixture::params::reduce_op_type; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + using aggregate_type = typename TestFixture::params::aggregate_type; + using reduce_op_type = typename TestFixture::params::reduce_op_type; using key_compare_op_type = typename TestFixture::params::key_compare_op; - using key_inner_type = typename test_utils::inner_type::type; - using key_distribution_type = typename std::conditional< - std::is_floating_point::value, - std::uniform_real_distribution, - std::uniform_int_distribution - >::type; + using key_inner_type = typename test_utils::inner_type::type; + using key_distribution_type = + typename std::conditional::value, + std::uniform_real_distribution, + std::uniform_int_distribution>::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; - const bool debug_synchronous = false; + const bool debug_synchronous = false; - reduce_op_type reduce_op; + reduce_op_type reduce_op; key_compare_op_type key_compare_op; - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); for(size_t size : get_sizes()) @@ -160,20 +171,19 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) const bool use_unique_keys = bool(test_utils::get_random_value(0, 1)); // Generate data and calculate expected results - std::vector unique_expected; + std::vector unique_expected; std::vector aggregates_expected; - size_t unique_count_expected = 0; + size_t unique_count_expected = 0; - std::vector keys_input(size); - key_distribution_type key_delta_dis(1, 5); + std::vector keys_input(size); + key_distribution_type key_delta_dis(1, 5); std::uniform_int_distribution key_count_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); - std::vector values_input = test_utils::get_random_data(size, 0, 100); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); + std::vector values_input + = test_utils::get_random_data(size, 0, 100); - size_t offset = 0; - key_type prev_key = key_distribution_type(0, 100)(gen); + size_t offset = 0; + key_type prev_key = key_distribution_type(0, 100)(gen); key_type current_key = prev_key + key_delta_dis(gen); while(offset < size) { @@ -203,7 +213,7 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) aggregates_expected.back() = reduce_op(aggregates_expected.back(), aggregate); } - if (use_unique_keys) + if(use_unique_keys) { prev_key = current_key; // e.g. 1,1,1,2,5,5,8,8,8 @@ -218,88 +228,73 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) offset += key_count; } - key_type * d_keys_input; - value_type * d_values_input; + key_type* d_keys_input; + value_type* d_values_input; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - aggregate_type * d_aggregates_output; - unsigned int * d_unique_count_output; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); + + key_type* d_unique_output; + aggregate_type* d_aggregates_output; + unsigned int* d_unique_count_output; HIP_CHECK(hipMalloc(&d_unique_output, unique_count_expected * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count_expected * sizeof(aggregate_type))); HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); size_t temporary_storage_bytes; - HIP_CHECK( - rp::reduce_by_key( - nullptr, temporary_storage_bytes, - d_keys_input, d_values_input, size, - test_utils::wrap_in_identity_iterator(d_unique_output), - test_utils::wrap_in_identity_iterator(d_aggregates_output), - d_unique_count_output, - reduce_op, key_compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::reduce_by_key( + nullptr, + temporary_storage_bytes, + d_keys_input, + d_values_input, + size, + test_utils::wrap_in_identity_iterator(d_unique_output), + test_utils::wrap_in_identity_iterator(d_aggregates_output), + d_unique_count_output, + reduce_op, + key_compare_op, + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rp::reduce_by_key( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_values_input, size, - d_unique_output, d_aggregates_output, - d_unique_count_output, - reduce_op, key_compare_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::reduce_by_key(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + size, + d_unique_output, + d_aggregates_output, + d_unique_count_output, + reduce_op, + key_compare_op, + stream, + debug_synchronous)); HIP_CHECK(hipFree(d_temporary_storage)); - std::vector unique_output(unique_count_expected); + std::vector unique_output(unique_count_expected); std::vector aggregates_output(unique_count_expected); - std::vector unique_count_output(1); - HIP_CHECK( - hipMemcpy( - unique_output.data(), d_unique_output, - unique_count_expected * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); - HIP_CHECK( - hipMemcpy( - aggregates_output.data(), d_aggregates_output, - unique_count_expected * sizeof(aggregate_type), - hipMemcpyDeviceToHost - ) - ); - HIP_CHECK( - hipMemcpy( - unique_count_output.data(), d_unique_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + std::vector unique_count_output(1); + HIP_CHECK(hipMemcpy(unique_output.data(), + d_unique_output, + unique_count_expected * sizeof(key_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(aggregates_output.data(), + d_aggregates_output, + unique_count_expected * sizeof(aggregate_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(unique_count_output.data(), + d_unique_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_values_input)); diff --git a/test/rocprim/test_device_run_length_encode.cpp b/test/rocprim/test_device_run_length_encode.cpp index 8049b831e..60a4a1faf 100644 --- a/test/rocprim/test_device_run_length_encode.cpp +++ b/test/rocprim/test_device_run_length_encode.cpp @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -39,59 +39,65 @@ #define HIP_CHECK(error) ASSERT_EQ(error, hipSuccess) -template< - class Key, - class Count, - unsigned int MinSegmentLength, - unsigned int MaxSegmentLength, - // Tests output iterator with void value_type (OutputIterator concept) - bool UseIdentityIterator = false -> +template struct params { - using key_type = Key; - using count_type = Count; - static constexpr unsigned int min_segment_length = MinSegmentLength; - static constexpr unsigned int max_segment_length = MaxSegmentLength; - static constexpr bool use_identity_iterator = UseIdentityIterator; + using key_type = Key; + using count_type = Count; + static constexpr unsigned int min_segment_length = MinSegmentLength; + static constexpr unsigned int max_segment_length = MaxSegmentLength; + static constexpr bool use_identity_iterator = UseIdentityIterator; }; -template -class RocprimDeviceRunLengthEncode : public ::testing::Test { +template +class RocprimDeviceRunLengthEncode : public ::testing::Test +{ public: using params = Params; }; -using custom_int2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; -typedef ::testing::Types< - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params, - params -> Params; +typedef ::testing::Types, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params, + params> + Params; TYPED_TEST_CASE(RocprimDeviceRunLengthEncode, Params); std::vector get_sizes() { - std::vector sizes = { - 1024, 2048, 4096, 1792, - 1, 10, 53, 211, 500, - 2345, 11001, 34567, - 100000, - (1 << 16) - 1220, (1 << 21) - 76543 - }; + std::vector sizes = {1024, + 2048, + 4096, + 1792, + 1, + 10, + 53, + 211, + 500, + 2345, + 11001, + 34567, + 100000, + (1 << 16) - 1220, + (1 << 21) - 76543}; const std::vector random_sizes = test_utils::get_random_data(5, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); return sizes; @@ -99,19 +105,18 @@ std::vector get_sizes() TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) { - using key_type = typename TestFixture::params::key_type; - using count_type = typename TestFixture::params::count_type; + using key_type = typename TestFixture::params::key_type; + using count_type = typename TestFixture::params::count_type; using key_inner_type = typename test_utils::inner_type::type; - using key_distribution_type = typename std::conditional< - std::is_floating_point::value, - std::uniform_real_distribution, - std::uniform_int_distribution - >::type; + using key_distribution_type = + typename std::conditional::value, + std::uniform_real_distribution, + std::uniform_int_distribution>::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; - const bool debug_synchronous = false; + const bool debug_synchronous = false; - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); for(size_t size : get_sizes()) @@ -121,27 +126,26 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) hipStream_t stream = 0; // default // Generate data and calculate expected results - std::vector unique_expected; + std::vector unique_expected; std::vector counts_expected; - size_t runs_count_expected = 0; + size_t runs_count_expected = 0; - std::vector input(size); - key_distribution_type key_delta_dis(1, 5); + std::vector input(size); + key_distribution_type key_delta_dis(1, 5); std::uniform_int_distribution key_count_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); - std::vector values_input = test_utils::get_random_data(size, 0, 100); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); + std::vector values_input + = test_utils::get_random_data(size, 0, 100); - size_t offset = 0; + size_t offset = 0; key_type current_key = key_distribution_type(0, 100)(gen); while(offset < size) { size_t key_count = key_count_dis(gen); - current_key = current_key + key_delta_dis(gen); + current_key = current_key + key_delta_dis(gen); const size_t end = std::min(size, offset + key_count); - key_count = end - offset; + key_count = end - offset; for(size_t i = offset; i < end; i++) { input[i] = current_key; @@ -154,78 +158,63 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) offset += key_count; } - key_type * d_input; + key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - key_type * d_unique_output; - count_type * d_counts_output; - count_type * d_runs_count_output; + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + key_type* d_unique_output; + count_type* d_counts_output; + count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_unique_output, runs_count_expected * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count_expected * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); size_t temporary_storage_bytes = 0; - HIP_CHECK( - rocprim::run_length_encode( - nullptr, temporary_storage_bytes, - d_input, size, - test_utils::wrap_in_identity_iterator(d_unique_output), - test_utils::wrap_in_identity_iterator(d_counts_output), - test_utils::wrap_in_identity_iterator(d_runs_count_output), - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::run_length_encode( + nullptr, + temporary_storage_bytes, + d_input, + size, + test_utils::wrap_in_identity_iterator(d_unique_output), + test_utils::wrap_in_identity_iterator(d_counts_output), + test_utils::wrap_in_identity_iterator(d_runs_count_output), + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::run_length_encode( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - test_utils::wrap_in_identity_iterator(d_unique_output), - test_utils::wrap_in_identity_iterator(d_counts_output), - test_utils::wrap_in_identity_iterator(d_runs_count_output), - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::run_length_encode( + d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + test_utils::wrap_in_identity_iterator(d_unique_output), + test_utils::wrap_in_identity_iterator(d_counts_output), + test_utils::wrap_in_identity_iterator(d_runs_count_output), + stream, + debug_synchronous)); HIP_CHECK(hipFree(d_temporary_storage)); - std::vector unique_output(runs_count_expected); + std::vector unique_output(runs_count_expected); std::vector counts_output(runs_count_expected); std::vector runs_count_output(1); - HIP_CHECK( - hipMemcpy( - unique_output.data(), d_unique_output, - runs_count_expected * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); - HIP_CHECK( - hipMemcpy( - counts_output.data(), d_counts_output, - runs_count_expected * sizeof(count_type), - hipMemcpyDeviceToHost - ) - ); - HIP_CHECK( - hipMemcpy( - runs_count_output.data(), d_runs_count_output, - sizeof(count_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(unique_output.data(), + d_unique_output, + runs_count_expected * sizeof(key_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(counts_output.data(), + d_counts_output, + runs_count_expected * sizeof(count_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(runs_count_output.data(), + d_runs_count_output, + sizeof(count_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_unique_output)); @@ -246,20 +235,19 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) { - using key_type = typename TestFixture::params::key_type; - using count_type = typename TestFixture::params::count_type; - using offset_type = typename TestFixture::params::count_type; + using key_type = typename TestFixture::params::key_type; + using count_type = typename TestFixture::params::count_type; + using offset_type = typename TestFixture::params::count_type; using key_inner_type = typename test_utils::inner_type::type; - using key_distribution_type = typename std::conditional< - std::is_floating_point::value, - std::uniform_real_distribution, - std::uniform_int_distribution - >::type; + using key_distribution_type = + typename std::conditional::value, + std::uniform_real_distribution, + std::uniform_int_distribution>::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; - const bool debug_synchronous = false; + const bool debug_synchronous = false; - const unsigned int seed = 123; + const unsigned int seed = 123; std::default_random_engine gen(seed); for(size_t size : get_sizes()) @@ -270,19 +258,18 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) // Generate data and calculate expected results std::vector offsets_expected; - std::vector counts_expected; - size_t runs_count_expected = 0; + std::vector counts_expected; + size_t runs_count_expected = 0; - std::vector input(size); - key_distribution_type key_delta_dis(1, 5); + std::vector input(size); + key_distribution_type key_delta_dis(1, 5); std::uniform_int_distribution key_count_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); std::bernoulli_distribution is_trivial_dis(0.1); - std::vector values_input = test_utils::get_random_data(size, 0, 100); + std::vector values_input + = test_utils::get_random_data(size, 0, 100); - size_t offset = 0; + size_t offset = 0; key_type current_key = key_distribution_type(0, 100)(gen); while(offset < size) { @@ -299,7 +286,7 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) current_key = current_key + key_delta_dis(gen); const size_t end = std::min(size, offset + key_count); - key_count = end - offset; + key_count = end - offset; for(size_t i = offset; i < end; i++) { input[i] = current_key; @@ -315,81 +302,68 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) offset += key_count; } - key_type * d_input; + key_type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets_output; - count_type * d_counts_output; - count_type * d_runs_count_output; - HIP_CHECK(hipMalloc(&d_offsets_output, std::max(1, runs_count_expected) * sizeof(offset_type))); - HIP_CHECK(hipMalloc(&d_counts_output, std::max(1, runs_count_expected) * sizeof(count_type))); + HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets_output; + count_type* d_counts_output; + count_type* d_runs_count_output; + HIP_CHECK(hipMalloc(&d_offsets_output, + std::max(1, runs_count_expected) * sizeof(offset_type))); + HIP_CHECK(hipMalloc(&d_counts_output, + std::max(1, runs_count_expected) * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); size_t temporary_storage_bytes = 0; - HIP_CHECK( - rocprim::run_length_encode_non_trivial_runs( - nullptr, temporary_storage_bytes, - d_input, size, - test_utils::wrap_in_identity_iterator(d_offsets_output), - test_utils::wrap_in_identity_iterator(d_counts_output), - test_utils::wrap_in_identity_iterator(d_runs_count_output), - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::run_length_encode_non_trivial_runs( + nullptr, + temporary_storage_bytes, + d_input, + size, + test_utils::wrap_in_identity_iterator(d_offsets_output), + test_utils::wrap_in_identity_iterator(d_counts_output), + test_utils::wrap_in_identity_iterator(d_runs_count_output), + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::run_length_encode_non_trivial_runs( - d_temporary_storage, temporary_storage_bytes, - d_input, size, - test_utils::wrap_in_identity_iterator(d_offsets_output), - test_utils::wrap_in_identity_iterator(d_counts_output), - test_utils::wrap_in_identity_iterator(d_runs_count_output), - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::run_length_encode_non_trivial_runs( + d_temporary_storage, + temporary_storage_bytes, + d_input, + size, + test_utils::wrap_in_identity_iterator(d_offsets_output), + test_utils::wrap_in_identity_iterator(d_counts_output), + test_utils::wrap_in_identity_iterator(d_runs_count_output), + stream, + debug_synchronous)); HIP_CHECK(hipFree(d_temporary_storage)); std::vector offsets_output(runs_count_expected); - std::vector counts_output(runs_count_expected); - std::vector runs_count_output(1); + std::vector counts_output(runs_count_expected); + std::vector runs_count_output(1); if(runs_count_expected > 0) { - HIP_CHECK( - hipMemcpy( - offsets_output.data(), d_offsets_output, - runs_count_expected * sizeof(offset_type), - hipMemcpyDeviceToHost - ) - ); - HIP_CHECK( - hipMemcpy( - counts_output.data(), d_counts_output, - runs_count_expected * sizeof(count_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(offsets_output.data(), + d_offsets_output, + runs_count_expected * sizeof(offset_type), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(counts_output.data(), + d_counts_output, + runs_count_expected * sizeof(count_type), + hipMemcpyDeviceToHost)); } - HIP_CHECK( - hipMemcpy( - runs_count_output.data(), d_runs_count_output, - sizeof(count_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(runs_count_output.data(), + d_runs_count_output, + sizeof(count_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_offsets_output)); diff --git a/test/rocprim/test_device_scan.cpp b/test/rocprim/test_device_scan.cpp index 36b8a9195..9afeac03d 100644 --- a/test/rocprim/test_device_scan.cpp +++ b/test/rocprim/test_device_scan.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include #include +#include +#include // Google Test #include @@ -39,19 +39,17 @@ namespace rp = rocprim; #define HIP_CHECK(error) ASSERT_EQ(error, hipSuccess) // Params for tests -template< - class InputType, - class OutputType = InputType, - class ScanOp = ::rocprim::plus, - // Tests output iterator with void value_type (OutputIterator concept) - // scan-by-key primitives don't support output iterator with void value_type - bool UseIdentityIteratorIfSupported = false -> +template , + // Tests output iterator with void value_type (OutputIterator concept) + // scan-by-key primitives don't support output iterator with void value_type + bool UseIdentityIteratorIfSupported = false> struct DeviceScanParams { - using input_type = InputType; - using output_type = OutputType; - using scan_op_type = ScanOp; + using input_type = InputType; + using output_type = OutputType; + using scan_op_type = ScanOp; static constexpr bool use_identity_iterator = UseIdentityIteratorIfSupported; }; @@ -59,14 +57,14 @@ struct DeviceScanParams // Test for scan ops taking single input value // --------------------------------------------------------- -template +template class RocprimDeviceScanTests : public ::testing::Test { public: - using input_type = typename Params::input_type; - using output_type = typename Params::output_type; - using scan_op_type = typename Params::scan_op_type; - const bool debug_synchronous = false; + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; + using scan_op_type = typename Params::scan_op_type; + const bool debug_synchronous = false; static constexpr bool use_identity_iterator = Params::use_identity_iterator; }; @@ -76,7 +74,7 @@ typedef ::testing::Types< DeviceScanParams, DeviceScanParams, DeviceScanParams, - DeviceScanParams >, + DeviceScanParams>, #ifndef __HIP__ // hip-clang does provide host comparison operators DeviceScanParams, @@ -84,32 +82,28 @@ typedef ::testing::Types< DeviceScanParams, #endif // Large - DeviceScanParams >, - DeviceScanParams >, - DeviceScanParams >, - DeviceScanParams >, - DeviceScanParams >, + DeviceScanParams>, + DeviceScanParams>, + DeviceScanParams>, + DeviceScanParams>, + DeviceScanParams>, DeviceScanParams, true>, - DeviceScanParams >, - DeviceScanParams >, - DeviceScanParams >, - DeviceScanParams< - test_utils::custom_test_type, test_utils::custom_test_type, - rp::plus >, true - >, - DeviceScanParams >, - DeviceScanParams >, - DeviceScanParams > -> RocprimDeviceScanTestsParams; + DeviceScanParams>, + DeviceScanParams>, + DeviceScanParams>, + DeviceScanParams, + test_utils::custom_test_type, + rp::plus>, + true>, + DeviceScanParams>, + DeviceScanParams>, + DeviceScanParams>> + RocprimDeviceScanTestsParams; std::vector get_sizes() { - std::vector sizes = { - 1, 10, 53, 211, - 1024, 2048, 5096, - 34567, (1 << 18), - (1 << 20) - 12345 - }; + std::vector sizes + = {1, 10, 53, 211, 1024, 2048, 5096, 34567, (1 << 18), (1 << 20) - 12345}; const std::vector random_sizes = test_utils::get_random_data(3, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); @@ -120,48 +114,45 @@ TYPED_TEST_CASE(RocprimDeviceScanTests, RocprimDeviceScanTestsParams); TYPED_TEST(RocprimDeviceScanTests, InclusiveScanEmptyInput) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; - using scan_op_type = typename TestFixture::scan_op_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default - U * d_output; + U* d_output; HIP_CHECK(hipMalloc(&d_output, sizeof(U))); - test_utils::out_of_bounds_flag out_of_bounds; + test_utils::out_of_bounds_flag out_of_bounds; test_utils::bounds_checking_iterator d_checking_output( - d_output, - out_of_bounds.device_pointer(), - 0 - ); + d_output, out_of_bounds.device_pointer(), 0); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::inclusive_scan( - d_temp_storage, temp_storage_size_bytes, - rocprim::make_constant_iterator(T(345)), - d_checking_output, - 0, scan_op_type(), stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::inclusive_scan(d_temp_storage, + temp_storage_size_bytes, + rocprim::make_constant_iterator(T(345)), + d_checking_output, + 0, + scan_op_type(), + stream, + debug_synchronous)); // allocate temporary storage HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); // Run - HIP_CHECK( - rocprim::inclusive_scan( - d_temp_storage, temp_storage_size_bytes, - rocprim::make_constant_iterator(T(345)), - d_checking_output, - 0, scan_op_type(), stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::inclusive_scan(d_temp_storage, + temp_storage_size_bytes, + rocprim::make_constant_iterator(T(345)), + d_checking_output, + 0, + scan_op_type(), + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); @@ -173,10 +164,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanEmptyInput) TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; - using scan_op_type = typename TestFixture::scan_op_type; - const bool debug_synchronous = TestFixture::debug_synchronous; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; + const bool debug_synchronous = TestFixture::debug_synchronous; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; const std::vector sizes = get_sizes(); @@ -190,17 +181,12 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) std::vector input = test_utils::get_random_data(size, 1, 10); std::vector output(input.size(), 0); - T * d_input; - U * d_output; + T* d_input; + U* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // scan function @@ -208,23 +194,21 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) // Calculate expected results on host std::vector expected(input.size()); - test_utils::host_inclusive_scan( - input.begin(), input.end(), - expected.begin(), scan_op - ); + test_utils::host_inclusive_scan(input.begin(), input.end(), expected.begin(), scan_op); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::inclusive_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - input.size(), scan_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::inclusive_scan( + d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + input.size(), + scan_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -234,25 +218,21 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::inclusive_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - input.size(), scan_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::inclusive_scan( + d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + input.size(), + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected @@ -266,10 +246,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; - using scan_op_type = typename TestFixture::scan_op_type; - const bool debug_synchronous = TestFixture::debug_synchronous; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; + const bool debug_synchronous = TestFixture::debug_synchronous; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; const std::vector sizes = get_sizes(); @@ -283,17 +263,12 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) std::vector input = test_utils::get_random_data(size, 1, 10); std::vector output(input.size()); - T * d_input; - U * d_output; + T* d_input; + U* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // scan function @@ -301,25 +276,24 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) // Calculate expected results on host std::vector expected(input.size()); - T initial_value = test_utils::get_random_value(1, 10); + T initial_value = test_utils::get_random_value(1, 10); test_utils::host_exclusive_scan( - input.begin(), input.end(), - initial_value, expected.begin(), - scan_op - ); + input.begin(), input.end(), initial_value, expected.begin(), scan_op); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::exclusive_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - initial_value, input.size(), scan_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::exclusive_scan( + d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + initial_value, + input.size(), + scan_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -329,25 +303,22 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::exclusive_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - initial_value, input.size(), scan_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::exclusive_scan( + d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + initial_value, + input.size(), + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected @@ -362,10 +333,10 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) { // scan-by-key does not support output iterator with void value_type - using T = typename TestFixture::input_type; - using K = unsigned int; // key type - using U = typename TestFixture::output_type; - using scan_op_type = typename TestFixture::scan_op_type; + using T = typename TestFixture::input_type; + using K = unsigned int; // key type + using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; const std::vector sizes = get_sizes(); @@ -391,26 +362,15 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) } std::vector output(input.size(), 0); - T * d_input; - K * d_keys; - U * d_output; + T* d_input; + K* d_keys; + U* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_keys, keys.size() * sizeof(K))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_keys, keys.data(), - keys.size() * sizeof(K), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_keys, keys.data(), keys.size() * sizeof(K), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // scan function @@ -421,41 +381,34 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) // Calculate expected results on host std::vector expected(input.size()); test_utils::host_inclusive_scan( + rocprim::make_zip_iterator(rocprim::make_tuple(input.begin(), keys.begin())), + rocprim::make_zip_iterator(rocprim::make_tuple(input.end(), keys.end())), rocprim::make_zip_iterator( - rocprim::make_tuple(input.begin(), keys.begin()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(input.end(), keys.end()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(expected.begin(), rocprim::make_discard_iterator()) - ), + rocprim::make_tuple(expected.begin(), rocprim::make_discard_iterator())), [scan_op, keys_compare_op](const rocprim::tuple& t1, - const rocprim::tuple& t2) - -> rocprim::tuple - { + const rocprim::tuple& t2) -> rocprim::tuple { if(keys_compare_op(rocprim::get<1>(t1), rocprim::get<1>(t2))) { - return rocprim::make_tuple( - scan_op(rocprim::get<0>(t1), rocprim::get<0>(t2)), - rocprim::get<1>(t2) - ); + return rocprim::make_tuple(scan_op(rocprim::get<0>(t1), rocprim::get<0>(t2)), + rocprim::get<1>(t2)); } return t2; - } - ); + }); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::inclusive_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, input.size(), - scan_op, keys_compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::inclusive_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + input.size(), + scan_op, + keys_compare_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -465,24 +418,22 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::inclusive_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, input.size(), - scan_op, keys_compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::inclusive_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + input.size(), + scan_op, + keys_compare_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected @@ -498,10 +449,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) { // scan-by-key does not support output iterator with void value_type - using T = typename TestFixture::input_type; - using K = unsigned int; // key type - using U = typename TestFixture::output_type; - using scan_op_type = typename TestFixture::scan_op_type; + using T = typename TestFixture::input_type; + using K = unsigned int; // key type + using U = typename TestFixture::output_type; + using scan_op_type = typename TestFixture::scan_op_type; const bool debug_synchronous = TestFixture::debug_synchronous; const std::vector sizes = get_sizes(); @@ -514,8 +465,8 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) const bool use_unique_keys = bool(test_utils::get_random_value(0, 1)); // Generate data - T initial_value = test_utils::get_random_value(1, 100); - std::vector input = test_utils::get_random_data(size, 0, 9); + T initial_value = test_utils::get_random_value(1, 100); + std::vector input = test_utils::get_random_data(size, 0, 9); std::vector keys; if(use_unique_keys) { @@ -528,26 +479,15 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) } std::vector output(input.size(), 0); - T * d_input; - K * d_keys; - U * d_output; + T* d_input; + K* d_keys; + U* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_keys, keys.size() * sizeof(K))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_keys, keys.data(), - keys.size() * sizeof(K), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_keys, keys.data(), keys.size() * sizeof(K), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // scan function @@ -557,23 +497,29 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) // Calculate expected results on host std::vector expected(input.size()); - test_utils::host_exclusive_scan_by_key( - input.begin(), input.end(), keys.begin(), - initial_value, expected.begin(), - scan_op, keys_compare_op - ); + test_utils::host_exclusive_scan_by_key(input.begin(), + input.end(), + keys.begin(), + initial_value, + expected.begin(), + scan_op, + keys_compare_op); // temp storage size_t temp_storage_size_bytes; - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; // Get size of d_temp_storage - HIP_CHECK( - rocprim::exclusive_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, input.size(), - scan_op, keys_compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::exclusive_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + input.size(), + scan_op, + keys_compare_op, + stream, + debug_synchronous)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); @@ -583,24 +529,23 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::exclusive_scan_by_key( - d_temp_storage, temp_storage_size_bytes, - d_keys, d_input, d_output, initial_value, input.size(), - scan_op, keys_compare_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::exclusive_scan_by_key(d_temp_storage, + temp_storage_size_bytes, + d_keys, + d_input, + d_output, + initial_value, + input.size(), + scan_op, + keys_compare_op, + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected diff --git a/test/rocprim/test_device_segmented_radix_sort.cpp b/test/rocprim/test_device_segmented_radix_sort.cpp index 13b630d9f..20b35fa83 100644 --- a/test/rocprim/test_device_segmented_radix_sort.cpp +++ b/test/rocprim/test_device_segmented_radix_sort.cpp @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -42,28 +42,27 @@ namespace rp = rocprim; #define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) -template< - class Key, - class Value, - bool Descending, - unsigned int StartBit, - unsigned int EndBit, - unsigned int MinSegmentLength, - unsigned int MaxSegmentLength -> +template struct params { - using key_type = Key; - using value_type = Value; - static constexpr bool descending = Descending; - static constexpr unsigned int start_bit = StartBit; - static constexpr unsigned int end_bit = EndBit; + using key_type = Key; + using value_type = Value; + static constexpr bool descending = Descending; + static constexpr unsigned int start_bit = StartBit; + static constexpr unsigned int end_bit = EndBit; static constexpr unsigned int min_segment_length = MinSegmentLength; static constexpr unsigned int max_segment_length = MaxSegmentLength; }; -template -class RocprimDeviceSegmentedRadixSort : public ::testing::Test { +template +class RocprimDeviceSegmentedRadixSort : public ::testing::Test +{ public: using params = Params; }; @@ -84,26 +83,27 @@ typedef ::testing::Types< params, params, params, - params, false, 8, 11, 50, 200> -> Params; + params, false, 8, 11, 50, 200>> + Params; TYPED_TEST_CASE(RocprimDeviceSegmentedRadixSort, Params); -template +template struct key_comparator { - static_assert(rp::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); + static_assert(rp::is_unsigned::value, + "Test supports start and end bits only for unsigned integers"); bool operator()(const Key& lhs, const Key& rhs) { auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = (static_cast(rhs) >> StartBit) & mask; + auto l = (static_cast(lhs) >> StartBit) & mask; + auto r = (static_cast(rhs) >> StartBit) & mask; return Descending ? (r < l) : (l < r); } }; -template +template struct key_comparator { bool operator()(const Key& lhs, const Key& rhs) @@ -112,7 +112,7 @@ struct key_comparator } }; -template +template struct key_comparator { bool operator()(const rp::half& lhs, const rp::half& rhs) @@ -122,7 +122,7 @@ struct key_comparator } }; -template +template struct key_value_comparator { bool operator()(const std::pair& lhs, const std::pair& rhs) @@ -134,12 +134,7 @@ struct key_value_comparator std::vector get_sizes() { std::vector sizes = { - 1024, 2048, 4096, 1792, - 1, 10, 53, 211, 500, - 2345, 11001, 34567, - 1000000, - (1 << 16) - 1220 - }; + 1024, 2048, 4096, 1792, 1, 10, 53, 211, 500, 2345, 11001, 34567, 1000000, (1 << 16) - 1220}; const std::vector random_sizes = test_utils::get_random_data(5, 1, 100000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); return sizes; @@ -147,10 +142,10 @@ std::vector get_sizes() TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeys) { - using key_type = typename TestFixture::params::key_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; + using key_type = typename TestFixture::params::key_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; using offset_type = unsigned int; @@ -158,13 +153,11 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeys) const bool debug_synchronous = false; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_int_distribution segment_length_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); const std::vector sizes = get_sizes(); for(size_t size : sizes) @@ -175,20 +168,18 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeys) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_dis(gen); @@ -198,87 +189,80 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeys) } offsets.push_back(size); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); // Calculate expected results on host std::vector expected(keys_input); for(size_t i = 0; i < segments_count; i++) { - std::stable_sort( - expected.begin() + offsets[i], - expected.begin() + offsets[i + 1], - key_comparator() - ); + std::stable_sort(expected.begin() + offsets[i], + expected.begin() + offsets[i + 1], + key_comparator()); } size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::segmented_radix_sort_keys( - nullptr, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys(nullptr, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit)); ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(descending) { - HIP_CHECK( - rp::segmented_radix_sort_keys_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::segmented_radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys_output, - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys_output, size * sizeof(key_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); @@ -291,11 +275,11 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeys) TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairs) { - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; using offset_type = unsigned int; @@ -303,13 +287,11 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairs) const bool debug_synchronous = false; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_int_distribution segment_length_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); const std::vector sizes = get_sizes(); for(size_t size : sizes) @@ -320,20 +302,18 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairs) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_dis(gen); @@ -346,39 +326,26 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairs) std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets; + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); using key_value = std::pair; @@ -393,27 +360,30 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairs) std::stable_sort( expected.begin() + offsets[i], expected.begin() + offsets[i + 1], - key_value_comparator() - ); + key_value_comparator()); } - std::vector keys_expected(size); + std::vector keys_expected(size); std::vector values_expected(size); for(size_t i = 0; i < size; i++) { - keys_expected[i] = expected[i].first; + keys_expected[i] = expected[i].first; values_expected[i] = expected[i].second; } - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::segmented_radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit)); ASSERT_GT(temporary_storage_bytes, 0U); @@ -421,46 +391,48 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairs) if(descending) { - HIP_CHECK( - rp::segmented_radix_sort_pairs_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::segmented_radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, d_keys_output, d_values_input, d_values_output, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_keys_output, + d_values_input, + d_values_output, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys_output, - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys_output, size * sizeof(key_type), hipMemcpyDeviceToHost)); std::vector values_output(size); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values_output, - size * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values_output, + size * sizeof(value_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); @@ -476,10 +448,10 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairs) TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeysDoubleBuffer) { - using key_type = typename TestFixture::params::key_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; + using key_type = typename TestFixture::params::key_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; using offset_type = unsigned int; @@ -487,13 +459,11 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeysDoubleBuffer) const bool debug_synchronous = false; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_int_distribution segment_length_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); const std::vector sizes = get_sizes(); for(size_t size : sizes) @@ -504,20 +474,18 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeysDoubleBuffer) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_dis(gen); @@ -527,37 +495,27 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeysDoubleBuffer) } offsets.push_back(size); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); // Calculate expected results on host std::vector expected(keys_input); for(size_t i = 0; i < segments_count; i++) { - std::stable_sort( - expected.begin() + offsets[i], - expected.begin() + offsets[i + 1], - key_comparator() - ); + std::stable_sort(expected.begin() + offsets[i], + expected.begin() + offsets[i + 1], + key_comparator()); } rp::double_buffer d_keys(d_keys_input, d_keys_output); @@ -566,53 +524,53 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeysDoubleBuffer) using config = rp::segmented_radix_sort_config<7, 4, rp::kernel_config<192, 5>>; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::segmented_radix_sort_keys( - nullptr, temporary_storage_bytes, - d_keys, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys(nullptr, + temporary_storage_bytes, + d_keys, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit)); ASSERT_GT(temporary_storage_bytes, 0U); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); if(descending) { - HIP_CHECK( - rp::segmented_radix_sort_keys_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::segmented_radix_sort_keys( - d_temporary_storage, temporary_storage_bytes, - d_keys, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, + temporary_storage_bytes, + d_keys, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys.current(), - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys.current(), size * sizeof(key_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); @@ -625,11 +583,11 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeysDoubleBuffer) TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairsDoubleBuffer) { - using key_type = typename TestFixture::params::key_type; - using value_type = typename TestFixture::params::value_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; + using key_type = typename TestFixture::params::key_type; + using value_type = typename TestFixture::params::value_type; + constexpr bool descending = TestFixture::params::descending; + constexpr unsigned int start_bit = TestFixture::params::start_bit; + constexpr unsigned int end_bit = TestFixture::params::end_bit; using offset_type = unsigned int; @@ -637,13 +595,11 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairsDoubleBuffer) const bool debug_synchronous = false; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_int_distribution segment_length_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); const std::vector sizes = get_sizes(); for(size_t size : sizes) @@ -654,20 +610,18 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairsDoubleBuffer) std::vector keys_input; if(rp::is_floating_point::value) { - keys_input = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000); + keys_input + = test_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); } else { keys_input = test_utils::get_random_data( - size, - std::numeric_limits::min(), - std::numeric_limits::max() - ); + size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_dis(gen); @@ -680,39 +634,26 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairsDoubleBuffer) std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); - key_type * d_keys_input; - key_type * d_keys_output; + key_type* d_keys_input; + key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - size * sizeof(key_type), - hipMemcpyHostToDevice - ) - ); - - value_type * d_values_input; - value_type * d_values_output; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); + + value_type* d_values_input; + value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(value_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets; + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); using key_value = std::pair; @@ -727,30 +668,31 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairsDoubleBuffer) std::stable_sort( expected.begin() + offsets[i], expected.begin() + offsets[i + 1], - key_value_comparator() - ); + key_value_comparator()); } - std::vector keys_expected(size); + std::vector keys_expected(size); std::vector values_expected(size); for(size_t i = 0; i < size; i++) { - keys_expected[i] = expected[i].first; + keys_expected[i] = expected[i].first; values_expected[i] = expected[i].second; } - rp::double_buffer d_keys(d_keys_input, d_keys_output); + rp::double_buffer d_keys(d_keys_input, d_keys_output); rp::double_buffer d_values(d_values_input, d_values_output); - void * d_temporary_storage = nullptr; + void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; - HIP_CHECK( - rp::segmented_radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys, d_values, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys, + d_values, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit)); ASSERT_GT(temporary_storage_bytes, 0U); @@ -758,46 +700,44 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortPairsDoubleBuffer) if(descending) { - HIP_CHECK( - rp::segmented_radix_sort_pairs_desc( - d_temporary_storage, temporary_storage_bytes, - d_keys, d_values, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs_desc(d_temporary_storage, + temporary_storage_bytes, + d_keys, + d_values, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } else { - HIP_CHECK( - rp::segmented_radix_sort_pairs( - d_temporary_storage, temporary_storage_bytes, - d_keys, d_values, size, - segments_count, d_offsets, d_offsets + 1, - start_bit, end_bit, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, + temporary_storage_bytes, + d_keys, + d_values, + size, + segments_count, + d_offsets, + d_offsets + 1, + start_bit, + end_bit, + stream, + debug_synchronous)); } std::vector keys_output(size); - HIP_CHECK( - hipMemcpy( - keys_output.data(), d_keys.current(), - size * sizeof(key_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + keys_output.data(), d_keys.current(), size * sizeof(key_type), hipMemcpyDeviceToHost)); std::vector values_output(size); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values.current(), - size * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values.current(), + size * sizeof(value_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); diff --git a/test/rocprim/test_device_segmented_reduce.cpp b/test/rocprim/test_device_segmented_reduce.cpp index 7f576b9ae..395b5c85e 100644 --- a/test/rocprim/test_device_segmented_reduce.cpp +++ b/test/rocprim/test_device_segmented_reduce.cpp @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -37,40 +37,39 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; -template< - class Input, - class Output, - class ReduceOp = ::rocprim::plus, - int Init = 0, // as only integral types supported, int is used here even for floating point inputs - unsigned int MinSegmentLength = 0, - unsigned int MaxSegmentLength = 1000, - // Tests output iterator with void value_type (OutputIterator concept) - bool UseIdentityIterator = false -> +template , + int Init + = 0, // as only integral types supported, int is used here even for floating point inputs + unsigned int MinSegmentLength = 0, + unsigned int MaxSegmentLength = 1000, + // Tests output iterator with void value_type (OutputIterator concept) + bool UseIdentityIterator = false> struct params { - using input_type = Input; - using output_type = Output; - using reduce_op_type = ReduceOp; - static constexpr int init = Init; - static constexpr unsigned int min_segment_length = MinSegmentLength; - static constexpr unsigned int max_segment_length = MaxSegmentLength; - static constexpr bool use_identity_iterator = UseIdentityIterator; + using input_type = Input; + using output_type = Output; + using reduce_op_type = ReduceOp; + static constexpr int init = Init; + static constexpr unsigned int min_segment_length = MinSegmentLength; + static constexpr unsigned int max_segment_length = MaxSegmentLength; + static constexpr bool use_identity_iterator = UseIdentityIterator; }; -template -class RocprimDeviceSegmentedReduce : public ::testing::Test { +template +class RocprimDeviceSegmentedReduce : public ::testing::Test +{ public: using params = Params; }; -using custom_short2 = test_utils::custom_test_type; -using custom_int2 = test_utils::custom_test_type; +using custom_short2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< @@ -86,20 +85,15 @@ typedef ::testing::Types< // hip-clang does not allow to convert half to float params, 0, 10, 300>, #endif - params -> Params; + params> + Params; TYPED_TEST_CASE(RocprimDeviceSegmentedReduce, Params); std::vector get_sizes() { std::vector sizes = { - 1024, 2048, 4096, 1792, - 1, 10, 53, 211, 500, - 2345, 11001, 34567, - 100000, - (1 << 16) - 1220 - }; + 1024, 2048, 4096, 1792, 1, 10, 53, 211, 500, 2345, 11001, 34567, 100000, (1 << 16) - 1220}; const std::vector random_sizes = test_utils::get_random_data(5, 1, 1000000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); return sizes; @@ -107,24 +101,22 @@ std::vector get_sizes() TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) { - using input_type = typename TestFixture::params::input_type; - using output_type = typename TestFixture::params::output_type; - using reduce_op_type = typename TestFixture::params::reduce_op_type; + using input_type = typename TestFixture::params::input_type; + using output_type = typename TestFixture::params::output_type; + using reduce_op_type = typename TestFixture::params::reduce_op_type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; using result_type = output_type; using offset_type = unsigned int; - const input_type init = TestFixture::params::init; - const bool debug_synchronous = false; - reduce_op_type reduce_op; + const input_type init = TestFixture::params::init; + const bool debug_synchronous = false; + reduce_op_type reduce_op; - std::random_device rd; - std::default_random_engine gen(rd()); + std::random_device rd; + std::default_random_engine gen(rd()); std::uniform_int_distribution segment_length_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); for(size_t size : get_sizes()) { @@ -135,18 +127,19 @@ TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) // Generate data and calculate expected results std::vector aggregates_expected; - std::vector values_input = test_utils::get_random_data(size, 0, 100); + std::vector values_input + = test_utils::get_random_data(size, 0, 100); std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const size_t end = std::min(size, offset + segment_length); - result_type aggregate = init; + const size_t end = std::min(size, offset + segment_length); + result_type aggregate = init; for(size_t i = offset; i < end; i++) { aggregate = reduce_op(aggregate, values_input[i]); @@ -158,74 +151,66 @@ TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) } offsets.push_back(size); - input_type * d_values_input; + input_type* d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(input_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(input_type), - hipMemcpyHostToDevice - ) - ); - - offset_type * d_offsets; + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(input_type), hipMemcpyHostToDevice)); + + offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); - - output_type * d_aggregates_output; + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); + + output_type* d_aggregates_output; HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(output_type))); size_t temporary_storage_bytes; - HIP_CHECK( - rp::segmented_reduce( - nullptr, temporary_storage_bytes, - d_values_input, d_aggregates_output, - segments_count, - d_offsets, d_offsets + 1, - reduce_op, init, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_reduce(nullptr, + temporary_storage_bytes, + d_values_input, + d_aggregates_output, + segments_count, + d_offsets, + d_offsets + 1, + reduce_op, + init, + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rp::segmented_reduce( - d_temporary_storage, temporary_storage_bytes, - d_values_input, - test_utils::wrap_in_identity_iterator(d_aggregates_output), - segments_count, - d_offsets, d_offsets + 1, - reduce_op, init, - stream, debug_synchronous - ) - ); + HIP_CHECK(rp::segmented_reduce( + d_temporary_storage, + temporary_storage_bytes, + d_values_input, + test_utils::wrap_in_identity_iterator(d_aggregates_output), + segments_count, + d_offsets, + d_offsets + 1, + reduce_op, + init, + stream, + debug_synchronous)); HIP_CHECK(hipFree(d_temporary_storage)); std::vector aggregates_output(segments_count); - HIP_CHECK( - hipMemcpy( - aggregates_output.data(), d_aggregates_output, - segments_count * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(aggregates_output.data(), + d_aggregates_output, + segments_count * sizeof(output_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_aggregates_output)); - ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(aggregates_output, aggregates_expected, 0.01f)); + ASSERT_NO_FATAL_FAILURE( + test_utils::assert_near(aggregates_output, aggregates_expected, 0.01f)); } } diff --git a/test/rocprim/test_device_segmented_scan.cpp b/test/rocprim/test_device_segmented_scan.cpp index 154061b46..92d1b6fb3 100644 --- a/test/rocprim/test_device_segmented_scan.cpp +++ b/test/rocprim/test_device_segmented_scan.cpp @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include // Google Test #include @@ -41,37 +41,37 @@ namespace rp = rocprim; -template< - class Input, - class Output, - class ScanOp = ::rocprim::plus, - int Init = 0, // as only integral types supported, int is used here even for floating point inputs - unsigned int MinSegmentLength = 0, - unsigned int MaxSegmentLength = 1000, - // Tests output iterator with void value_type (OutputIterator concept) - // Segmented scan primitives which use head flags do not support this kind - // of output iterators. - bool UseIdentityIterator = false -> +template , + int Init + = 0, // as only integral types supported, int is used here even for floating point inputs + unsigned int MinSegmentLength = 0, + unsigned int MaxSegmentLength = 1000, + // Tests output iterator with void value_type (OutputIterator concept) + // Segmented scan primitives which use head flags do not support this kind + // of output iterators. + bool UseIdentityIterator = false> struct params { - using input_type = Input; - using output_type = Output; - using scan_op_type = ScanOp; - static constexpr int init = Init; - static constexpr unsigned int min_segment_length = MinSegmentLength; - static constexpr unsigned int max_segment_length = MaxSegmentLength; - static constexpr bool use_identity_iterator = UseIdentityIterator; + using input_type = Input; + using output_type = Output; + using scan_op_type = ScanOp; + static constexpr int init = Init; + static constexpr unsigned int min_segment_length = MinSegmentLength; + static constexpr unsigned int max_segment_length = MaxSegmentLength; + static constexpr bool use_identity_iterator = UseIdentityIterator; }; -template -class RocprimDeviceSegmentedScan : public ::testing::Test { +template +class RocprimDeviceSegmentedScan : public ::testing::Test +{ public: using params = Params; }; -using custom_short2 = test_utils::custom_test_type; -using custom_int2 = test_utils::custom_test_type; +using custom_short2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; typedef ::testing::Types< @@ -87,19 +87,15 @@ typedef ::testing::Types< // hip-clang does provide host comparison operators params, #endif - params, 10, 3000, 4000> -> Params; + params, 10, 3000, 4000>> + Params; TYPED_TEST_CASE(RocprimDeviceSegmentedScan, Params); std::vector get_sizes() { - std::vector sizes = { - 1024, 2048, 4096, 1792, - 1, 10, 53, 211, 500, - 2345, 11001, 34567, - (1 << 16) - 1220 - }; + std::vector sizes + = {1024, 2048, 4096, 1792, 1, 10, 53, 211, 500, 2345, 11001, 34567, (1 << 16) - 1220}; const std::vector random_sizes = test_utils::get_random_data(2, 1, 1000000); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); return sizes; @@ -107,24 +103,21 @@ std::vector get_sizes() TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScan) { - using input_type = typename TestFixture::params::input_type; - using output_type = typename TestFixture::params::output_type; - using scan_op_type = typename TestFixture::params::scan_op_type; - static constexpr bool use_identity_iterator = - TestFixture::params::use_identity_iterator; - using result_type = output_type; - - using offset_type = unsigned int; - const bool debug_synchronous = false; + using input_type = typename TestFixture::params::input_type; + using output_type = typename TestFixture::params::output_type; + using scan_op_type = typename TestFixture::params::scan_op_type; + static constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; + using result_type = output_type; + + using offset_type = unsigned int; + const bool debug_synchronous = false; scan_op_type scan_op; - std::random_device rd; + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_int_distribution segment_length_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); hipStream_t stream = 0; // default stream @@ -135,22 +128,23 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScan) // Generate data and calculate expected results std::vector values_expected(size); - std::vector values_input = test_utils::get_random_data(size, 0, 100); + std::vector values_input + = test_utils::get_random_data(size, 0, 100); std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const size_t end = std::min(size, offset + segment_length); - result_type aggregate = values_input[offset]; + const size_t end = std::min(size, offset + segment_length); + result_type aggregate = values_input[offset]; values_expected[offset] = aggregate; for(size_t i = offset + 1; i < end; i++) { - aggregate = scan_op(aggregate, values_input[i]); + aggregate = scan_op(aggregate, values_input[i]); values_expected[i] = aggregate; } @@ -159,66 +153,55 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScan) } offsets.push_back(size); - input_type * d_values_input; - offset_type * d_offsets; - output_type * d_values_output; + input_type* d_values_input; + offset_type* d_offsets; + output_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(input_type))); HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(input_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(input_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); size_t temporary_storage_bytes; - HIP_CHECK( - rocprim::segmented_inclusive_scan( - nullptr, temporary_storage_bytes, - d_values_input, - test_utils::wrap_in_identity_iterator(d_values_output), - segments_count, - d_offsets, d_offsets + 1, - scan_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_inclusive_scan( + nullptr, + temporary_storage_bytes, + d_values_input, + test_utils::wrap_in_identity_iterator(d_values_output), + segments_count, + d_offsets, + d_offsets + 1, + scan_op, + stream, + debug_synchronous)); ASSERT_GT(temporary_storage_bytes, 0); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::segmented_inclusive_scan( - d_temporary_storage, temporary_storage_bytes, - d_values_input, - test_utils::wrap_in_identity_iterator(d_values_output), - segments_count, - d_offsets, d_offsets + 1, - scan_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_inclusive_scan( + d_temporary_storage, + temporary_storage_bytes, + d_values_input, + test_utils::wrap_in_identity_iterator(d_values_output), + segments_count, + d_offsets, + d_offsets + 1, + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); std::vector values_output(size); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values_output, - values_output.size() * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values_output, + values_output.size() * sizeof(output_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(values_output, values_expected, 0.01f)); @@ -232,25 +215,22 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScan) TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScan) { - using input_type = typename TestFixture::params::input_type; - using output_type = typename TestFixture::params::output_type; - using scan_op_type = typename TestFixture::params::scan_op_type; - static constexpr bool use_identity_iterator = - TestFixture::params::use_identity_iterator; - using result_type = output_type; - using offset_type = unsigned int; - - const input_type init = TestFixture::params::init; - const bool debug_synchronous = false; - scan_op_type scan_op; - - std::random_device rd; + using input_type = typename TestFixture::params::input_type; + using output_type = typename TestFixture::params::output_type; + using scan_op_type = typename TestFixture::params::scan_op_type; + static constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; + using result_type = output_type; + using offset_type = unsigned int; + + const input_type init = TestFixture::params::init; + const bool debug_synchronous = false; + scan_op_type scan_op; + + std::random_device rd; std::default_random_engine gen(rd()); std::uniform_int_distribution segment_length_dis( - TestFixture::params::min_segment_length, - TestFixture::params::max_segment_length - ); + TestFixture::params::min_segment_length, TestFixture::params::max_segment_length); hipStream_t stream = 0; // default stream @@ -261,22 +241,23 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScan) // Generate data and calculate expected results std::vector values_expected(size); - std::vector values_input = test_utils::get_random_data(size, 0, 100); + std::vector values_input + = test_utils::get_random_data(size, 0, 100); std::vector offsets; - unsigned int segments_count = 0; - size_t offset = 0; + unsigned int segments_count = 0; + size_t offset = 0; while(offset < size) { const size_t segment_length = segment_length_dis(gen); offsets.push_back(offset); - const size_t end = std::min(size, offset + segment_length); - result_type aggregate = init; + const size_t end = std::min(size, offset + segment_length); + result_type aggregate = init; values_expected[offset] = aggregate; for(size_t i = offset + 1; i < end; i++) { - aggregate = scan_op(aggregate, values_input[i-1]); + aggregate = scan_op(aggregate, values_input[i - 1]); values_expected[i] = aggregate; } @@ -285,67 +266,58 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScan) } offsets.push_back(size); - input_type * d_values_input; - offset_type * d_offsets; - output_type * d_values_output; + input_type* d_values_input; + offset_type* d_offsets; + output_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(input_type))); HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - size * sizeof(input_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_offsets, offsets.data(), - (segments_count + 1) * sizeof(offset_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_values_input, values_input.data(), size * sizeof(input_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_offsets, + offsets.data(), + (segments_count + 1) * sizeof(offset_type), + hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); size_t temporary_storage_bytes; - HIP_CHECK( - rocprim::segmented_exclusive_scan( - nullptr, temporary_storage_bytes, - d_values_input, - test_utils::wrap_in_identity_iterator(d_values_output), - segments_count, - d_offsets, d_offsets + 1, - init, scan_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_exclusive_scan( + nullptr, + temporary_storage_bytes, + d_values_input, + test_utils::wrap_in_identity_iterator(d_values_output), + segments_count, + d_offsets, + d_offsets + 1, + init, + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_GT(temporary_storage_bytes, 0); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::segmented_exclusive_scan( - d_temporary_storage, temporary_storage_bytes, - d_values_input, - test_utils::wrap_in_identity_iterator(d_values_output), - segments_count, - d_offsets, d_offsets + 1, - init, scan_op, - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_exclusive_scan( + d_temporary_storage, + temporary_storage_bytes, + d_values_input, + test_utils::wrap_in_identity_iterator(d_values_output), + segments_count, + d_offsets, + d_offsets + 1, + init, + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); std::vector values_output(size); - HIP_CHECK( - hipMemcpy( - values_output.data(), d_values_output, - values_output.size() * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(values_output.data(), + d_values_output, + values_output.size() * sizeof(output_type), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(values_output, values_expected, 0.01f)); @@ -360,10 +332,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScan) TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScanUsingHeadFlags) { // Does not support output iterator with void value_type - using input_type = typename TestFixture::params::input_type; - using flag_type = unsigned int; - using output_type = typename TestFixture::params::output_type; - using scan_op_type = typename TestFixture::params::scan_op_type; + using input_type = typename TestFixture::params::input_type; + using flag_type = unsigned int; + using output_type = typename TestFixture::params::output_type; + using scan_op_type = typename TestFixture::params::scan_op_type; const bool debug_synchronous = false; hipStream_t stream = 0; // default stream @@ -375,33 +347,24 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScanUsingHeadFlags) // Generate data std::vector input = test_utils::get_random_data(size, 1, 10); - std::vector flags = test_utils::get_random_data(size, 0, 10); - flags[0] = 1U; - std::transform( - flags.begin(), flags.end(), flags.begin(), - [](flag_type a){ if(a == 1U) return 1U; return 0U; } - ); - - input_type * d_input; - flag_type * d_flags; - output_type * d_output; + std::vector flags = test_utils::get_random_data(size, 0, 10); + flags[0] = 1U; + std::transform(flags.begin(), flags.end(), flags.begin(), [](flag_type a) { + if(a == 1U) + return 1U; + return 0U; + }); + + input_type* d_input; + flag_type* d_flags; + output_type* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input_type))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(flag_type))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(input_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(flag_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_input, input.data(), input.size() * sizeof(input_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_flags, flags.data(), flags.size() * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // scan function @@ -410,71 +373,59 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScanUsingHeadFlags) // Calculate expected results on host std::vector expected(input.size()); test_utils::host_inclusive_scan( + rocprim::make_zip_iterator(rocprim::make_tuple(input.begin(), flags.begin())), + rocprim::make_zip_iterator(rocprim::make_tuple(input.end(), flags.end())), rocprim::make_zip_iterator( - rocprim::make_tuple(input.begin(), flags.begin()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(input.end(), flags.end()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(expected.begin(), rocprim::make_discard_iterator()) - ), + rocprim::make_tuple(expected.begin(), rocprim::make_discard_iterator())), [scan_op](const rocprim::tuple& t1, const rocprim::tuple& t2) - -> rocprim::tuple - { + -> rocprim::tuple { if(!rocprim::get<1>(t2)) { - return rocprim::make_tuple( - scan_op(rocprim::get<0>(t1), rocprim::get<0>(t2)), - rocprim::get<1>(t1) + rocprim::get<1>(t2) - ); + return rocprim::make_tuple(scan_op(rocprim::get<0>(t1), rocprim::get<0>(t2)), + rocprim::get<1>(t1) + rocprim::get<1>(t2)); } return t2; - } - ); + }); // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::segmented_inclusive_scan( - nullptr, temp_storage_size_bytes, - d_input, d_output, d_flags, - input.size(), scan_op, stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_inclusive_scan(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_flags, + input.size(), + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::segmented_inclusive_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, d_flags, - input.size(), scan_op, stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_inclusive_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_flags, + input.size(), + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected std::vector output(input.size()); - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, output.size() * sizeof(output_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01f)); @@ -489,12 +440,12 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScanUsingHeadFlags) TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScanUsingHeadFlags) { // Does not support output iterator with void value_type - using input_type = typename TestFixture::params::input_type; - using flag_type = unsigned int; - using output_type = typename TestFixture::params::output_type; - using scan_op_type = typename TestFixture::params::scan_op_type; - const input_type init = TestFixture::params::init; - const bool debug_synchronous = false; + using input_type = typename TestFixture::params::input_type; + using flag_type = unsigned int; + using output_type = typename TestFixture::params::output_type; + using scan_op_type = typename TestFixture::params::scan_op_type; + const input_type init = TestFixture::params::init; + const bool debug_synchronous = false; hipStream_t stream = 0; // default stream @@ -505,33 +456,24 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScanUsingHeadFlags) // Generate data std::vector input = test_utils::get_random_data(size, 1, 10); - std::vector flags = test_utils::get_random_data(size, 0, 10); - flags[0] = 1U; - std::transform( - flags.begin(), flags.end(), flags.begin(), - [](flag_type a){ if(a == 1U) return 1U; return 0U; } - ); - - input_type * d_input; - flag_type * d_flags; - output_type * d_output; + std::vector flags = test_utils::get_random_data(size, 0, 10); + flags[0] = 1U; + std::transform(flags.begin(), flags.end(), flags.begin(), [](flag_type a) { + if(a == 1U) + return 1U; + return 0U; + }); + + input_type* d_input; + flag_type* d_flags; + output_type* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input_type))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(flag_type))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(output_type))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(input_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(flag_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + d_input, input.data(), input.size() * sizeof(input_type), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + d_flags, flags.data(), flags.size() * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // scan function @@ -543,93 +485,75 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScanUsingHeadFlags) // This shifts input one to the right and initializes segments with init. expected[0] = init; std::transform( + rocprim::make_zip_iterator(rocprim::make_tuple(input.begin(), flags.begin() + 1)), + rocprim::make_zip_iterator(rocprim::make_tuple(input.end() - 1, flags.end())), rocprim::make_zip_iterator( - rocprim::make_tuple(input.begin(), flags.begin()+1) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(input.end() - 1, flags.end()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(expected.begin() + 1, rocprim::make_discard_iterator()) - ), + rocprim::make_tuple(expected.begin() + 1, rocprim::make_discard_iterator())), [init](const rocprim::tuple& t) - -> rocprim::tuple - { + -> rocprim::tuple { if(rocprim::get<1>(t)) { - return rocprim::make_tuple( - init, - rocprim::get<1>(t) - ); + return rocprim::make_tuple(init, rocprim::get<1>(t)); } return t; - } - ); + }); // Now we can run inclusive scan and get segmented exclusive results test_utils::host_inclusive_scan( + rocprim::make_zip_iterator(rocprim::make_tuple(expected.begin(), flags.begin())), + rocprim::make_zip_iterator(rocprim::make_tuple(expected.end(), flags.end())), rocprim::make_zip_iterator( - rocprim::make_tuple(expected.begin(), flags.begin()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(expected.end(), flags.end()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(expected.begin(), rocprim::make_discard_iterator()) - ), + rocprim::make_tuple(expected.begin(), rocprim::make_discard_iterator())), [scan_op](const rocprim::tuple& t1, const rocprim::tuple& t2) - -> rocprim::tuple - { + -> rocprim::tuple { if(!rocprim::get<1>(t2)) { - return rocprim::make_tuple( - scan_op(rocprim::get<0>(t1), rocprim::get<0>(t2)), - rocprim::get<1>(t1) + rocprim::get<1>(t2) - ); + return rocprim::make_tuple(scan_op(rocprim::get<0>(t1), rocprim::get<0>(t2)), + rocprim::get<1>(t1) + rocprim::get<1>(t2)); } return t2; - } - ); + }); // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::segmented_exclusive_scan( - nullptr, temp_storage_size_bytes, - d_input, d_output, d_flags, init, - input.size(), scan_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_exclusive_scan(nullptr, + temp_storage_size_bytes, + d_input, + d_output, + d_flags, + init, + input.size(), + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::segmented_exclusive_scan( - d_temp_storage, temp_storage_size_bytes, - d_input, d_output, d_flags, init, - input.size(), scan_op, stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::segmented_exclusive_scan(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_output, + d_flags, + init, + input.size(), + scan_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected std::vector output(input.size()); - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(output_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, output.size() * sizeof(output_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_NO_FATAL_FAILURE(test_utils::assert_near(output, expected, 0.01f)); diff --git a/test/rocprim/test_device_select.cpp b/test/rocprim/test_device_select.cpp index e081a28b9..ceaa2991a 100644 --- a/test/rocprim/test_device_select.cpp +++ b/test/rocprim/test_device_select.cpp @@ -20,9 +20,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include #include #include -#include // Google Test #include @@ -34,49 +34,44 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) // Params for tests -template< - class InputType, - class OutputType = InputType, - class FlagType = unsigned int, - bool UseIdentityIterator = false -> +template struct DeviceSelectParams { - using input_type = InputType; - using output_type = OutputType; - using flag_type = FlagType; + using input_type = InputType; + using output_type = OutputType; + using flag_type = FlagType; static constexpr bool use_identity_iterator = UseIdentityIterator; }; -template +template class RocprimDeviceSelectTests : public ::testing::Test { public: - using input_type = typename Params::input_type; - using output_type = typename Params::output_type; - using flag_type = typename Params::flag_type; - const bool debug_synchronous = false; + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; + using flag_type = typename Params::flag_type; + const bool debug_synchronous = false; static constexpr bool use_identity_iterator = Params::use_identity_iterator; }; -typedef ::testing::Types< - DeviceSelectParams, - DeviceSelectParams, - DeviceSelectParams, - DeviceSelectParams, test_utils::custom_test_type, int, true> -> RocprimDeviceSelectTestsParams; +typedef ::testing::Types, + DeviceSelectParams, + DeviceSelectParams, + DeviceSelectParams, + test_utils::custom_test_type, + int, + true>> + RocprimDeviceSelectTestsParams; std::vector get_sizes() { - std::vector sizes = { - 2, 32, 64, 256, - 1024, 2048, - 3072, 4096, - 27845, (1 << 18) + 1111 - }; + std::vector sizes = {2, 32, 64, 256, 1024, 2048, 3072, 4096, 27845, (1 << 18) + 1111}; const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); @@ -87,11 +82,11 @@ TYPED_TEST_CASE(RocprimDeviceSelectTests, RocprimDeviceSelectTestsParams); TYPED_TEST(RocprimDeviceSelectTests, Flagged) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; - using F = typename TestFixture::flag_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; + using F = typename TestFixture::flag_type; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; - const bool debug_synchronous = TestFixture::debug_synchronous; + const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream @@ -104,28 +99,18 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) std::vector input = test_utils::get_random_data(size, 1, 100); std::vector flags = test_utils::get_random_data(size, 0, 1); - T * d_input; - F * d_flags; - U * d_output; - unsigned int * d_selected_count_output; + T* d_input; + F* d_flags; + U* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(F))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(U))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_flags, flags.data(), - flags.size() * sizeof(F), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host @@ -143,65 +128,51 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK( - rocprim::select( - nullptr, - temp_storage_size_bytes, - d_input, - d_flags, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - stream, - debug_synchronous - ) - ); + rocprim::select(nullptr, + temp_storage_size_bytes, + d_input, + d_flags, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run HIP_CHECK( - rocprim::select( - d_temp_storage, - temp_storage_size_bytes, - d_input, - d_flags, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - stream, - debug_synchronous - ) - ); + rocprim::select(d_temp_storage, + temp_storage_size_bytes, + d_input, + d_flags, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if number of selected value is as expected unsigned int selected_count_output = 0; - HIP_CHECK( - hipMemcpy( - &selected_count_output, d_selected_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_EQ(selected_count_output, expected.size()); // Check if output values are as expected std::vector output(input.size()); HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < expected.size(); i++) { @@ -218,18 +189,18 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) TYPED_TEST(RocprimDeviceSelectTests, SelectOp) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; - const bool debug_synchronous = TestFixture::debug_synchronous; + const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream - auto select_op = [] __host__ __device__ (const T& value) -> bool - { - if(value < T(50)) return true; - return false; - }; + auto select_op = [] __host__ __device__(const T& value) -> bool { + if(value < T(50)) + return true; + return false; + }; const std::vector sizes = get_sizes(); for(auto size : sizes) @@ -239,19 +210,14 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) // Generate data std::vector input = test_utils::get_random_data(size, 0, 100); - T * d_input; - U * d_output; - unsigned int * d_selected_count_output; + T* d_input; + U* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(U))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host @@ -269,65 +235,51 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK( - rocprim::select( - nullptr, - temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - select_op, - stream, - debug_synchronous - ) - ); + rocprim::select(nullptr, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run HIP_CHECK( - rocprim::select( - d_temp_storage, - temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - select_op, - stream, - debug_synchronous - ) - ); + rocprim::select(d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + select_op, + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if number of selected value is as expected unsigned int selected_count_output = 0; - HIP_CHECK( - hipMemcpy( - &selected_count_output, d_selected_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_EQ(selected_count_output, expected.size()); // Check if output values are as expected std::vector output(input.size()); HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < expected.size(); i++) { @@ -343,67 +295,54 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) std::vector get_discontinuity_probabilities() { - std::vector probabilities = { - 0.05, 0.25, 0.5, 0.75, 0.95, 1 - }; + std::vector probabilities = {0.05, 0.25, 0.5, 0.75, 0.95, 1}; return probabilities; } TYPED_TEST(RocprimDeviceSelectTests, UniqueEmptyInput) { - using T = typename TestFixture::input_type; + using T = typename TestFixture::input_type; const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream // Allocate and copy to device - unsigned int * d_selected_count_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::unique( - nullptr, - temp_storage_size_bytes, - rocprim::make_constant_iterator(123), - rocprim::make_discard_iterator(), - d_selected_count_output, - 0, - ::rocprim::equal_to(), - stream, - debug_synchronous - ) - ); - - void * d_temp_storage = nullptr; + HIP_CHECK(rocprim::unique(nullptr, + temp_storage_size_bytes, + rocprim::make_constant_iterator(123), + rocprim::make_discard_iterator(), + d_selected_count_output, + 0, + ::rocprim::equal_to(), + stream, + debug_synchronous)); + + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); // Run - HIP_CHECK( - rocprim::unique( - d_temp_storage, - temp_storage_size_bytes, - rocprim::make_constant_iterator(123), - rocprim::make_discard_iterator(), - d_selected_count_output, - 0, - ::rocprim::equal_to(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::unique(d_temp_storage, + temp_storage_size_bytes, + rocprim::make_constant_iterator(123), + rocprim::make_discard_iterator(), + d_selected_count_output, + 0, + ::rocprim::equal_to(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if number of selected value is 0 unsigned int selected_count_output = 0; - HIP_CHECK( - hipMemcpy( - &selected_count_output, d_selected_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); ASSERT_EQ(selected_count_output, 0); hipFree(d_selected_count_output); @@ -412,14 +351,14 @@ TYPED_TEST(RocprimDeviceSelectTests, UniqueEmptyInput) TYPED_TEST(RocprimDeviceSelectTests, Unique) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; - const bool debug_synchronous = TestFixture::debug_synchronous; + const bool debug_synchronous = TestFixture::debug_synchronous; hipStream_t stream = 0; // default stream - const auto sizes = get_sizes(); + const auto sizes = get_sizes(); const auto probabilities = get_discontinuity_probabilities(); for(auto size : sizes) { @@ -433,24 +372,18 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) { std::vector input01 = test_utils::get_random_data01(size, p); test_utils::host_inclusive_scan( - input01.begin(), input01.end(), input.begin(), rocprim::plus() - ); + input01.begin(), input01.end(), input.begin(), rocprim::plus()); } // Allocate and copy to device - T * d_input; - U * d_output; - unsigned int * d_selected_count_output; + T* d_input; + U* d_output; + unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(U))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host @@ -459,7 +392,7 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) expected.push_back(input[0]); for(size_t i = 1; i < input.size(); i++) { - if(!(input[i-1] == input[i])) + if(!(input[i - 1] == input[i])) { expected.push_back(input[i]); } @@ -468,66 +401,52 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::unique( - nullptr, - temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - ::rocprim::equal_to(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::unique( + nullptr, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + ::rocprim::equal_to(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::unique( - d_temp_storage, - temp_storage_size_bytes, - d_input, - test_utils::wrap_in_identity_iterator(d_output), - d_selected_count_output, - input.size(), - ::rocprim::equal_to(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::unique( + d_temp_storage, + temp_storage_size_bytes, + d_input, + test_utils::wrap_in_identity_iterator(d_output), + d_selected_count_output, + input.size(), + ::rocprim::equal_to(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if number of selected value is as expected unsigned int selected_count_output = 0; - HIP_CHECK( - hipMemcpy( - &selected_count_output, d_selected_count_output, - sizeof(unsigned int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(&selected_count_output, + d_selected_count_output, + sizeof(unsigned int), + hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_EQ(selected_count_output, expected.size()); // Check if output values are as expected std::vector output(input.size()); - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < expected.size(); i++) { diff --git a/test/rocprim/test_device_transform.cpp b/test/rocprim/test_device_transform.cpp index 34b890d09..f8a08e0e0 100644 --- a/test/rocprim/test_device_transform.cpp +++ b/test/rocprim/test_device_transform.cpp @@ -20,9 +20,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include #include #include -#include // Google Test #include @@ -33,21 +33,16 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; // Params for tests -template< - class InputType, - class OutputType = InputType, - bool UseIdentityIterator = false -> +template struct DeviceTransformParams { - using input_type = InputType; - using output_type = OutputType; + using input_type = InputType; + using output_type = OutputType; static constexpr bool use_identity_iterator = UseIdentityIterator; }; @@ -55,36 +50,31 @@ struct DeviceTransformParams // Test for reduce ops taking single input value // --------------------------------------------------------- -template +template class RocprimDeviceTransformTests : public ::testing::Test { public: - using input_type = typename Params::input_type; - using output_type = typename Params::output_type; + using input_type = typename Params::input_type; + using output_type = typename Params::output_type; static constexpr bool use_identity_iterator = Params::use_identity_iterator; - static constexpr bool debug_synchronous = false; + static constexpr bool debug_synchronous = false; }; -using custom_short2 = test_utils::custom_test_type; -using custom_int2 = test_utils::custom_test_type; +using custom_short2 = test_utils::custom_test_type; +using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; -typedef ::testing::Types< - DeviceTransformParams, - DeviceTransformParams, - DeviceTransformParams, - DeviceTransformParams, - DeviceTransformParams, - DeviceTransformParams -> RocprimDeviceTransformTestsParams; +typedef ::testing::Types, + DeviceTransformParams, + DeviceTransformParams, + DeviceTransformParams, + DeviceTransformParams, + DeviceTransformParams> + RocprimDeviceTransformTestsParams; std::vector get_sizes() { - std::vector sizes = { - 1, 10, 53, 211, - 1024, 2048, 5096, - 34567, (1 << 17) - 1220 - }; + std::vector sizes = {1, 10, 53, 211, 1024, 2048, 5096, 34567, (1 << 17) - 1220}; const std::vector random_sizes = test_utils::get_random_data(2, 1, 16384); sizes.insert(sizes.end(), random_sizes.begin(), random_sizes.end()); std::sort(sizes.begin(), sizes.end()); @@ -93,11 +83,10 @@ std::vector get_sizes() TYPED_TEST_CASE(RocprimDeviceTransformTests, RocprimDeviceTransformTestsParams); -template +template struct transform { - __device__ __host__ inline - constexpr T operator()(const T& a) const + __device__ __host__ inline constexpr T operator()(const T& a) const { return a + 5; } @@ -105,10 +94,10 @@ struct transform TYPED_TEST(RocprimDeviceTransformTests, Transform) { - using T = typename TestFixture::input_type; - using U = typename TestFixture::output_type; + using T = typename TestFixture::input_type; + using U = typename TestFixture::output_type; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; - const bool debug_synchronous = TestFixture::debug_synchronous; + const bool debug_synchronous = TestFixture::debug_synchronous; const std::vector sizes = get_sizes(); for(auto size : sizes) @@ -121,17 +110,12 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) std::vector input = test_utils::get_random_data(size, 1, 100); std::vector output(input.size(), 0); - T * d_input; - U * d_output; + T* d_input; + U* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host @@ -139,24 +123,19 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) std::transform(input.begin(), input.end(), expected.begin(), transform()); // Run - HIP_CHECK( - rocprim::transform( - d_input, - test_utils::wrap_in_identity_iterator(d_output), - input.size(), transform(), stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::transform( + d_input, + test_utils::wrap_in_identity_iterator(d_output), + input.size(), + transform(), + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected @@ -167,11 +146,10 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) } } -template +template struct binary_transform { - __device__ __host__ inline - constexpr U operator()(const T1& a, const T2& b) const + __device__ __host__ inline constexpr U operator()(const T1& a, const T2& b) const { return a + b; } @@ -179,11 +157,11 @@ struct binary_transform TYPED_TEST(RocprimDeviceTransformTests, BinaryTransform) { - using T1 = typename TestFixture::input_type; - using T2 = typename TestFixture::input_type; - using U = typename TestFixture::output_type; + using T1 = typename TestFixture::input_type; + using T2 = typename TestFixture::input_type; + using U = typename TestFixture::output_type; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; - const bool debug_synchronous = TestFixture::debug_synchronous; + const bool debug_synchronous = TestFixture::debug_synchronous; const std::vector sizes = get_sizes(); for(auto size : sizes) @@ -195,56 +173,43 @@ TYPED_TEST(RocprimDeviceTransformTests, BinaryTransform) // Generate data std::vector input1 = test_utils::get_random_data(size, 1, 100); std::vector input2 = test_utils::get_random_data(size, 1, 100); - std::vector output(input1.size(), 0); + std::vector output(input1.size(), 0); - T1 * d_input1; - T2 * d_input2; - U * d_output; + T1* d_input1; + T2* d_input2; + U* d_output; HIP_CHECK(hipMalloc(&d_input1, input1.size() * sizeof(T1))); HIP_CHECK(hipMalloc(&d_input2, input2.size() * sizeof(T2))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input1, input1.data(), - input1.size() * sizeof(T1), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input1, input1.data(), input1.size() * sizeof(T1), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input2, input2.data(), - input2.size() * sizeof(T2), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input2, input2.data(), input2.size() * sizeof(T2), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host std::vector expected(input1.size()); - std::transform( - input1.begin(), input1.end(), input2.begin(), - expected.begin(), binary_transform() - ); + std::transform(input1.begin(), + input1.end(), + input2.begin(), + expected.begin(), + binary_transform()); // Run - HIP_CHECK( - rocprim::transform( - d_input1, d_input2, - test_utils::wrap_in_identity_iterator(d_output), - input1.size(), binary_transform(), stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::transform( + d_input1, + d_input2, + test_utils::wrap_in_identity_iterator(d_output), + input1.size(), + binary_transform(), + stream, + debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected diff --git a/test/rocprim/test_discard_iterator.cpp b/test/rocprim/test_discard_iterator.cpp index 2ea359bda..8996d6a2d 100644 --- a/test/rocprim/test_discard_iterator.cpp +++ b/test/rocprim/test_discard_iterator.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include +#include #include +#include // Google Test #include @@ -34,7 +34,7 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(error,hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(error, hipSuccess) TEST(RocprimDiscardIteratorTests, Equal) { @@ -78,81 +78,68 @@ TEST(RocprimDiscardIteratorTests, ReduceByKey) hipStream_t stream = 0; // default // host input - std::vector keys_input = { - 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0 - }; + std::vector keys_input = {0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0}; std::vector values_input(keys_input.size(), 1); // expected output - std::vector aggregates_expected = { 3, 2, 2, 4 }; + std::vector aggregates_expected = {3, 2, 2, 4}; // device input/output - int * d_keys_input; - int * d_values_input; + int* d_keys_input; + int* d_values_input; HIP_CHECK(hipMalloc(&d_keys_input, keys_input.size() * sizeof(int))); HIP_CHECK(hipMalloc(&d_values_input, values_input.size() * sizeof(int))); - HIP_CHECK( - hipMemcpy( - d_keys_input, keys_input.data(), - keys_input.size() * sizeof(int), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_values_input, values_input.data(), - values_input.size() * sizeof(int), - hipMemcpyHostToDevice - ) - ); - int * d_aggregates_output; + HIP_CHECK(hipMemcpy( + d_keys_input, keys_input.data(), keys_input.size() * sizeof(int), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_values_input, + values_input.data(), + values_input.size() * sizeof(int), + hipMemcpyHostToDevice)); + int* d_aggregates_output; HIP_CHECK(hipMalloc(&d_aggregates_output, aggregates_expected.size() * sizeof(int))); HIP_CHECK(hipDeviceSynchronize()); // Get temporary storage size size_t temporary_storage_bytes; - HIP_CHECK( - rocprim::reduce_by_key( - nullptr, temporary_storage_bytes, - d_keys_input, - d_values_input, values_input.size(), - rocprim::make_discard_iterator(), - d_aggregates_output, - rocprim::make_discard_iterator(), - rocprim::plus(), rocprim::equal_to(), - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce_by_key(nullptr, + temporary_storage_bytes, + d_keys_input, + d_values_input, + values_input.size(), + rocprim::make_discard_iterator(), + d_aggregates_output, + rocprim::make_discard_iterator(), + rocprim::plus(), + rocprim::equal_to(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_GT(temporary_storage_bytes, 0); - void * d_temporary_storage; + void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - HIP_CHECK( - rocprim::reduce_by_key( - d_temporary_storage, temporary_storage_bytes, - d_keys_input, - d_values_input, values_input.size(), - rocprim::make_discard_iterator(), - d_aggregates_output, - rocprim::make_discard_iterator(), - rocprim::plus(), rocprim::equal_to(), - stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce_by_key(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + values_input.size(), + rocprim::make_discard_iterator(), + d_aggregates_output, + rocprim::make_discard_iterator(), + rocprim::plus(), + rocprim::equal_to(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected std::vector aggregates_output(aggregates_expected.size()); - HIP_CHECK( - hipMemcpy( - aggregates_output.data(), d_aggregates_output, - aggregates_expected.size() * sizeof(int), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(aggregates_output.data(), + d_aggregates_output, + aggregates_expected.size() * sizeof(int), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < aggregates_output.size(); i++) { ASSERT_EQ(aggregates_output[i], aggregates_expected[i]); diff --git a/test/rocprim/test_intrinsics.cpp b/test/rocprim/test_intrinsics.cpp index 4c070ad20..2cb1bed76 100644 --- a/test/rocprim/test_intrinsics.cpp +++ b/test/rocprim/test_intrinsics.cpp @@ -20,9 +20,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include #include #include -#include // Google Test #include @@ -36,9 +36,9 @@ // Custom structure struct custom_notaligned { - short i; - double d; - float f; + short i; + double d; + float f; unsigned int u; ROCPRIM_HOST_DEVICE @@ -48,19 +48,17 @@ struct custom_notaligned }; ROCPRIM_HOST_DEVICE -inline bool operator==(const custom_notaligned& lhs, - const custom_notaligned& rhs) +inline bool operator==(const custom_notaligned& lhs, const custom_notaligned& rhs) { - return lhs.i == rhs.i && lhs.d == rhs.d - && lhs.f == rhs.f &&lhs.u == rhs.u; + return lhs.i == rhs.i && lhs.d == rhs.d && lhs.f == rhs.f && lhs.u == rhs.u; } // Custom structure aligned to 16 bytes struct custom_16aligned { - int i; + int i; unsigned int u; - float f; + float f; ROCPRIM_HOST_DEVICE custom_16aligned() {}; @@ -68,115 +66,96 @@ struct custom_16aligned ~custom_16aligned() {}; } __attribute__((aligned(16))); -inline ROCPRIM_HOST_DEVICE -bool operator==(const custom_16aligned& lhs, const custom_16aligned& rhs) +inline ROCPRIM_HOST_DEVICE bool operator==(const custom_16aligned& lhs, const custom_16aligned& rhs) { return lhs.i == rhs.i && lhs.f == rhs.f && lhs.u == rhs.u; } // Params for tests -template +template struct params { using type = T; }; -template +template class RocprimIntrinsicsTests : public ::testing::Test { public: using type = typename Params::type; }; -typedef ::testing::Types< - params, - params, - params, - params -> IntrinsicsTestParams; +typedef ::testing::Types, params, params, params> + IntrinsicsTestParams; TYPED_TEST_CASE(RocprimIntrinsicsTests, IntrinsicsTestParams); -template -__global__ -void shuffle_up_kernel(T* data, unsigned int delta, unsigned int width) +template +__global__ void shuffle_up_kernel(T* data, unsigned int delta, unsigned int width) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - T value = data[index]; - value = rocprim::warp_shuffle_up(value, delta, width); - data[index] = value; + T value = data[index]; + value = rocprim::warp_shuffle_up(value, delta, width); + data[index] = value; } TYPED_TEST(RocprimIntrinsicsTests, ShuffleUp) { - using T = typename TestFixture::type; + using T = typename TestFixture::type; const size_t hardware_warp_size = ::rocprim::warp_size(); - const size_t size = hardware_warp_size; + const size_t size = hardware_warp_size; // Generate input - auto input = test_utils::get_random_data(size, T(-100), T(100)); + auto input = test_utils::get_random_data(size, T(-100), T(100)); std::vector output(input.size()); T* device_data; - HIP_CHECK( - hipMalloc( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); - - for(unsigned int i = hardware_warp_size; i > 1; i = i/2) + HIP_CHECK(hipMalloc(&device_data, input.size() * sizeof(typename decltype(input)::value_type))); + + for(unsigned int i = hardware_warp_size; i > 1; i = i / 2) { const unsigned int logical_warp_size = i; SCOPED_TRACE(testing::Message() << "where logical_warp_size = " << i); auto deltas = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1) - ); + std::max(1, logical_warp_size - 1)); for(auto delta : deltas) { SCOPED_TRACE(testing::Message() << "where delta = " << delta); // Calculate expected results on host std::vector expected(size, 0); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { - size_t index = j + logical_warp_size * i; - auto up_index = j > delta-1 ? index-delta : index; + size_t index = j + logical_warp_size * i; + auto up_index = j > delta - 1 ? index - delta : index; expected[index] = input[up_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + device_data, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, delta, logical_warp_size - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + delta, + logical_warp_size); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), device_data, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -187,86 +166,72 @@ TYPED_TEST(RocprimIntrinsicsTests, ShuffleUp) hipFree(device_data); } -template -__global__ -void shuffle_down_kernel(T* data, unsigned int delta, unsigned int width) +template +__global__ void shuffle_down_kernel(T* data, unsigned int delta, unsigned int width) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - T value = data[index]; - value = rocprim::warp_shuffle_down(value, delta, width); - data[index] = value; + T value = data[index]; + value = rocprim::warp_shuffle_down(value, delta, width); + data[index] = value; } TYPED_TEST(RocprimIntrinsicsTests, ShuffleDown) { - using T = typename TestFixture::type; + using T = typename TestFixture::type; const size_t hardware_warp_size = ::rocprim::warp_size(); - const size_t size = hardware_warp_size; + const size_t size = hardware_warp_size; // Generate input - auto input = test_utils::get_random_data(size, T(-100), T(100)); + auto input = test_utils::get_random_data(size, T(-100), T(100)); std::vector output(input.size()); T* device_data; - HIP_CHECK( - hipMalloc( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); - - for(unsigned int i = hardware_warp_size; i > 1; i = i/2) + HIP_CHECK(hipMalloc(&device_data, input.size() * sizeof(typename decltype(input)::value_type))); + + for(unsigned int i = hardware_warp_size; i > 1; i = i / 2) { const unsigned int logical_warp_size = i; SCOPED_TRACE(testing::Message() << "where logical_warp_size = " << i); auto deltas = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1) - ); + std::max(1, logical_warp_size - 1)); for(auto delta : deltas) { SCOPED_TRACE(testing::Message() << "where delta = " << delta); // Calculate expected results on host std::vector expected(size, 0); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { - size_t index = j + logical_warp_size * i; - auto down_index = j+delta < logical_warp_size ? index+delta : index; - expected[index] = input[down_index]; + size_t index = j + logical_warp_size * i; + auto down_index = j + delta < logical_warp_size ? index + delta : index; + expected[index] = input[down_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + device_data, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_down_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, delta, logical_warp_size - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_down_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + delta, + logical_warp_size); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), device_data, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -277,99 +242,77 @@ TYPED_TEST(RocprimIntrinsicsTests, ShuffleDown) hipFree(device_data); } -template -__global__ -void shuffle_index_kernel(T* data, int* src_lanes, unsigned int width) +template +__global__ void shuffle_index_kernel(T* data, int* src_lanes, unsigned int width) { const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; - T value = data[index]; - value = rocprim::warp_shuffle( - value, src_lanes[hipThreadIdx_x/width], width - ); + T value = data[index]; + value = rocprim::warp_shuffle(value, src_lanes[hipThreadIdx_x / width], width); data[index] = value; } TYPED_TEST(RocprimIntrinsicsTests, ShuffleIndex) { - using T = typename TestFixture::type; + using T = typename TestFixture::type; const size_t hardware_warp_size = ::rocprim::warp_size(); - const size_t size = hardware_warp_size; + const size_t size = hardware_warp_size; // Generate input - auto input = test_utils::get_random_data(size, T(-100), T(100)); + auto input = test_utils::get_random_data(size, T(-100), T(100)); std::vector output(input.size()); - T* device_data; - int * device_src_lanes; - HIP_CHECK( - hipMalloc( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); - HIP_CHECK( - hipMalloc( - &device_src_lanes, - hardware_warp_size * sizeof(int) - ) - ); - - for(unsigned int i = hardware_warp_size; i > 1; i = i/2) + T* device_data; + int* device_src_lanes; + HIP_CHECK(hipMalloc(&device_data, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(hipMalloc(&device_src_lanes, hardware_warp_size * sizeof(int))); + + for(unsigned int i = hardware_warp_size; i > 1; i = i / 2) { const unsigned int logical_warp_size = i; SCOPED_TRACE(testing::Message() << "where logical_warp_size = " << i); auto src_lanes = test_utils::get_random_data( - hardware_warp_size/logical_warp_size, - 0, std::max(0, logical_warp_size-1) - ); + hardware_warp_size / logical_warp_size, 0, std::max(0, logical_warp_size - 1)); // Calculate expected results on host std::vector expected(size, 0); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { int src_lane = src_lanes[i]; for(size_t j = 0; j < logical_warp_size; j++) { size_t index = j + logical_warp_size * i; - if(src_lane >= int(logical_warp_size) || src_lane < 0) src_lane = index; + if(src_lane >= int(logical_warp_size) || src_lane < 0) + src_lane = index; expected[index] = input[src_lane + logical_warp_size * i]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(typename decltype(input)::value_type), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - device_src_lanes, src_lanes.data(), - src_lanes.size() * sizeof(typename decltype(src_lanes)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(device_data, + input.data(), + input.size() * sizeof(typename decltype(input)::value_type), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(device_src_lanes, + src_lanes.data(), + src_lanes.size() * sizeof(typename decltype(src_lanes)::value_type), + hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_index_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, device_src_lanes, logical_warp_size - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_index_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + device_src_lanes, + logical_warp_size); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), device_data, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -382,82 +325,69 @@ TYPED_TEST(RocprimIntrinsicsTests, ShuffleIndex) TEST(RocprimIntrinsicsTests, ShuffleUpCustomStruct) { - using T = custom_notaligned; + using T = custom_notaligned; const size_t hardware_warp_size = ::rocprim::warp_size(); - const size_t size = hardware_warp_size; + const size_t size = hardware_warp_size; // Generate input std::vector random_data = test_utils::get_random_data(4 * size, -100, 100); - std::vector input(size); - std::vector output(input.size()); - for(size_t i = 0; i < 4 * input.size(); i+=4) + std::vector input(size); + std::vector output(input.size()); + for(size_t i = 0; i < 4 * input.size(); i += 4) { - input[i/4].i = random_data[i]; - input[i/4].d = random_data[i+1]; - input[i/4].f = random_data[i+2]; - input[i/4].u = random_data[i+3]; + input[i / 4].i = random_data[i]; + input[i / 4].d = random_data[i + 1]; + input[i / 4].f = random_data[i + 2]; + input[i / 4].u = random_data[i + 3]; } T* device_data; - HIP_CHECK( - hipMalloc( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); - - for(unsigned int i = hardware_warp_size; i > 1; i = i/2) + HIP_CHECK(hipMalloc(&device_data, input.size() * sizeof(typename decltype(input)::value_type))); + + for(unsigned int i = hardware_warp_size; i > 1; i = i / 2) { const unsigned int logical_warp_size = i; SCOPED_TRACE(testing::Message() << "where logical_warp_size = " << i); auto deltas = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1) - ); + std::max(1, logical_warp_size - 1)); for(auto delta : deltas) { SCOPED_TRACE(testing::Message() << "where delta = " << delta); // Calculate expected results on host std::vector expected(size); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { - size_t index = j + logical_warp_size * i; - auto up_index = j > delta-1 ? index-delta : index; + size_t index = j + logical_warp_size * i; + auto up_index = j > delta - 1 ? index - delta : index; expected[index] = input[up_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + device_data, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, delta, logical_warp_size - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + delta, + logical_warp_size); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), device_data, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -470,81 +400,68 @@ TEST(RocprimIntrinsicsTests, ShuffleUpCustomStruct) TEST(RocprimIntrinsicsTests, ShuffleUpCustomAlignedStruct) { - using T = custom_16aligned; - const size_t hardware_warp_size = ::rocprim::warp_size(); - const size_t size = hardware_warp_size; + using T = custom_16aligned; + const size_t hardware_warp_size = ::rocprim::warp_size(); + const size_t size = hardware_warp_size; // Generate input std::vector random_data = test_utils::get_random_data(3 * size, -100, 100); - std::vector input(size); - std::vector output(input.size()); - for(size_t i = 0; i < 3 * input.size(); i+=3) + std::vector input(size); + std::vector output(input.size()); + for(size_t i = 0; i < 3 * input.size(); i += 3) { - input[i/3].i = random_data[i]; - input[i/3].u = random_data[i+1]; - input[i/3].f = random_data[i+2]; + input[i / 3].i = random_data[i]; + input[i / 3].u = random_data[i + 1]; + input[i / 3].f = random_data[i + 2]; } T* device_data; - HIP_CHECK( - hipMalloc( - &device_data, - input.size() * sizeof(typename decltype(input)::value_type) - ) - ); - - for(unsigned int i = hardware_warp_size; i > 1; i = i/2) + HIP_CHECK(hipMalloc(&device_data, input.size() * sizeof(typename decltype(input)::value_type))); + + for(unsigned int i = hardware_warp_size; i > 1; i = i / 2) { const unsigned int logical_warp_size = i; SCOPED_TRACE(testing::Message() << "where logical_warp_size = " << i); auto deltas = test_utils::get_random_data( - std::max(1, logical_warp_size/2), + std::max(1, logical_warp_size / 2), 1U, - std::max(1, logical_warp_size - 1) - ); + std::max(1, logical_warp_size - 1)); for(auto delta : deltas) { SCOPED_TRACE(testing::Message() << "where delta = " << delta); // Calculate expected results on host std::vector expected(size); - for(size_t i = 0; i < input.size()/logical_warp_size; i++) + for(size_t i = 0; i < input.size() / logical_warp_size; i++) { for(size_t j = 0; j < logical_warp_size; j++) { - size_t index = j + logical_warp_size * i; - auto up_index = j > delta-1 ? index-delta : index; + size_t index = j + logical_warp_size * i; + auto up_index = j > delta - 1 ? index - delta : index; expected[index] = input[up_index]; } } // Writing to device memory - HIP_CHECK( - hipMemcpy( - device_data, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy( + device_data, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(shuffle_up_kernel), - dim3(1), dim3(hardware_warp_size), 0, 0, - device_data, delta, logical_warp_size - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(shuffle_up_kernel), + dim3(1), + dim3(hardware_warp_size), + 0, + 0, + device_data, + delta, + logical_warp_size); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), device_data, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), device_data, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { diff --git a/test/rocprim/test_texture_cache_iterator.cpp b/test/rocprim/test_texture_cache_iterator.cpp index 1f55b8722..a49016062 100644 --- a/test/rocprim/test_texture_cache_iterator.cpp +++ b/test/rocprim/test_texture_cache_iterator.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include +#include #include +#include // Google Test #include @@ -34,41 +34,38 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) \ - ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) // Params for tests -template +template struct RocprimTextureCacheIteratorParams { using input_type = InputType; }; -template +template class RocprimTextureCacheIteratorTests : public ::testing::Test { public: - using input_type = typename Params::input_type; + using input_type = typename Params::input_type; const bool debug_synchronous = false; }; -typedef ::testing::Types< - RocprimTextureCacheIteratorParams, - RocprimTextureCacheIteratorParams, - RocprimTextureCacheIteratorParams, - RocprimTextureCacheIteratorParams, - RocprimTextureCacheIteratorParams, - RocprimTextureCacheIteratorParams>, - RocprimTextureCacheIteratorParams> -> RocprimTextureCacheIteratorTestsParams; +typedef ::testing::Types, + RocprimTextureCacheIteratorParams, + RocprimTextureCacheIteratorParams, + RocprimTextureCacheIteratorParams, + RocprimTextureCacheIteratorParams, + RocprimTextureCacheIteratorParams>, + RocprimTextureCacheIteratorParams>> + RocprimTextureCacheIteratorTestsParams; TYPED_TEST_CASE(RocprimTextureCacheIteratorTests, RocprimTextureCacheIteratorTestsParams); -template +template struct transform { - __device__ __host__ - constexpr T operator()(const T& a) const + __device__ __host__ constexpr T operator()(const T& a) const { return a + 5; } @@ -76,8 +73,8 @@ struct transform TYPED_TEST(RocprimTextureCacheIteratorTests, Transform) { - using T = typename TestFixture::input_type; - using Iterator = typename rocprim::texture_cache_iterator; + using T = typename TestFixture::input_type; + using Iterator = typename rocprim::texture_cache_iterator; const bool debug_synchronous = TestFixture::debug_synchronous; const size_t size = 1024; @@ -92,17 +89,11 @@ TYPED_TEST(RocprimTextureCacheIteratorTests, Transform) } std::vector output(size); - T * d_input; - T * d_output; + T* d_input; + T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(T))); - HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); Iterator x; @@ -110,31 +101,15 @@ TYPED_TEST(RocprimTextureCacheIteratorTests, Transform) // Calculate expected results on host std::vector expected(size); - std::transform( - input.begin(), - input.end(), - expected.begin(), - transform() - ); + std::transform(input.begin(), input.end(), expected.begin(), transform()); // Run - HIP_CHECK( - rocprim::transform( - x, d_output, size, - transform(), stream, debug_synchronous - ) - ); + HIP_CHECK(rocprim::transform(x, d_output, size, transform(), stream, debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), d_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Validating results diff --git a/test/rocprim/test_transform_iterator.cpp b/test/rocprim/test_transform_iterator.cpp index 1190f5f1a..8f906cece 100644 --- a/test/rocprim/test_transform_iterator.cpp +++ b/test/rocprim/test_transform_iterator.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include +#include #include +#include // Google Test #include @@ -34,9 +34,9 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) -template +template struct times_two { ROCPRIM_HOST_DEVICE @@ -46,7 +46,7 @@ struct times_two } }; -template +template struct plus_ten { ROCPRIM_HOST_DEVICE @@ -57,45 +57,39 @@ struct plus_ten }; // Params for tests -template< - class InputType, - class UnaryFunction = times_two, - class ValueType = InputType -> +template , class ValueType = InputType> struct RocprimTransformIteratorParams { - using input_type = InputType; - using value_type = ValueType; + using input_type = InputType; + using value_type = ValueType; using unary_function = UnaryFunction; }; -template +template class RocprimTransformIteratorTests : public ::testing::Test { public: - using input_type = typename Params::input_type; - using value_type = typename Params::value_type; - using unary_function = typename Params::unary_function; + using input_type = typename Params::input_type; + using value_type = typename Params::value_type; + using unary_function = typename Params::unary_function; const bool debug_synchronous = false; }; -typedef ::testing::Types< - RocprimTransformIteratorParams>, - RocprimTransformIteratorParams, - RocprimTransformIteratorParams, - RocprimTransformIteratorParams, double> -> RocprimTransformIteratorTestsParams; +typedef ::testing::Types>, + RocprimTransformIteratorParams, + RocprimTransformIteratorParams, + RocprimTransformIteratorParams, double>> + RocprimTransformIteratorTestsParams; TYPED_TEST_CASE(RocprimTransformIteratorTests, RocprimTransformIteratorTestsParams); TYPED_TEST(RocprimTransformIteratorTests, TransformReduce) { - using input_type = typename TestFixture::input_type; - using value_type = typename TestFixture::value_type; + using input_type = typename TestFixture::input_type; + using value_type = typename TestFixture::value_type; using unary_function = typename TestFixture::unary_function; - using iterator_type = typename rocprim::transform_iterator< - input_type*, unary_function, value_type - >; + using iterator_type = + typename rocprim::transform_iterator; hipStream_t stream = 0; // default @@ -104,75 +98,57 @@ TYPED_TEST(RocprimTransformIteratorTests, TransformReduce) std::vector input = test_utils::get_random_data(size, 1, 200); std::vector output(1); - input_type * d_input; - value_type * d_output; + input_type* d_input; + value_type* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input_type))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(value_type))); HIP_CHECK( - hipMemcpy( - d_input, input.data(), - input.size() * sizeof(input_type), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input, input.data(), input.size() * sizeof(input_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - auto reduce_op = rocprim::plus(); + auto reduce_op = rocprim::plus(); unary_function transform; // Calculate expected results on host iterator_type x(input.data(), transform); - value_type expected = std::accumulate(x, x + size, value_type(0), reduce_op); + value_type expected = std::accumulate(x, x + size, value_type(0), reduce_op); auto d_iter = iterator_type(d_input, transform); // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::reduce( - nullptr, - temp_storage_size_bytes, - d_iter, - d_output, - value_type(0), - input.size(), - reduce_op, - stream - ) - ); + HIP_CHECK(rocprim::reduce(nullptr, + temp_storage_size_bytes, + d_iter, + d_output, + value_type(0), + input.size(), + reduce_op, + stream)); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Run - HIP_CHECK( - rocprim::reduce( - d_temp_storage, - temp_storage_size_bytes, - d_iter, - d_output, - value_type(0), - input.size(), - reduce_op, - stream, - TestFixture::debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce(d_temp_storage, + temp_storage_size_bytes, + d_iter, + d_output, + value_type(0), + input.size(), + reduce_op, + stream, + TestFixture::debug_synchronous)); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy( + output.data(), d_output, output.size() * sizeof(value_type), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected diff --git a/test/rocprim/test_utils.hpp b/test/rocprim/test_utils.hpp index e1ab5b51f..092271d9b 100644 --- a/test/rocprim/test_utils.hpp +++ b/test/rocprim/test_utils.hpp @@ -22,10 +22,10 @@ #define TEST_TEST_UTILS_HPP_ #include -#include +#include #include #include -#include +#include // Google Test #include @@ -39,8 +39,7 @@ #include "bounds_checking_iterator.hpp" // For better Google Test reporting and debug output of half values -inline -std::ostream& operator<<(std::ostream& stream, const rocprim::half& value) +inline std::ostream& operator<<(std::ostream& stream, const rocprim::half& value) { stream << static_cast(value); return stream; @@ -49,743 +48,742 @@ std::ostream& operator<<(std::ostream& stream, const rocprim::half& value) namespace test_utils { -// Support half operators on host side - -ROCPRIM_HOST inline -_Float16 half_to_native(const rocprim::half& x) -{ - return *reinterpret_cast(&x); -} - -ROCPRIM_HOST inline -rocprim::half native_to_half(const _Float16& x) -{ - return *reinterpret_cast(&x); -} + // Support half operators on host side -struct half_less -{ - ROCPRIM_HOST_DEVICE inline - bool operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a < b; - #else - return half_to_native(a) < half_to_native(b); - #endif + ROCPRIM_HOST inline _Float16 half_to_native(const rocprim::half& x) + { + return *reinterpret_cast(&x); } -}; -struct half_less_equal -{ - ROCPRIM_HOST_DEVICE inline - bool operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a <= b; - #else - return half_to_native(a) <= half_to_native(b); - #endif + ROCPRIM_HOST inline rocprim::half native_to_half(const _Float16& x) + { + return *reinterpret_cast(&x); } -}; -struct half_greater -{ - ROCPRIM_HOST_DEVICE inline - bool operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a > b; - #else - return half_to_native(a) > half_to_native(b); - #endif - } -}; + struct half_less + { + ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a < b; +#else + return half_to_native(a) < half_to_native(b); +#endif + } + }; -struct half_greater_equal -{ - ROCPRIM_HOST_DEVICE inline - bool operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a >= b; - #else - return half_to_native(a) >= half_to_native(b); - #endif - } -}; + struct half_less_equal + { + ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a <= b; +#else + return half_to_native(a) <= half_to_native(b); +#endif + } + }; -struct half_equal_to -{ - ROCPRIM_HOST_DEVICE inline - bool operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a == b; - #else - return half_to_native(a) == half_to_native(b); - #endif - } -}; + struct half_greater + { + ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a > b; +#else + return half_to_native(a) > half_to_native(b); +#endif + } + }; -struct half_not_equal_to -{ - ROCPRIM_HOST_DEVICE inline - bool operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a != b; - #else - return half_to_native(a) != half_to_native(b); - #endif - } -}; + struct half_greater_equal + { + ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a >= b; +#else + return half_to_native(a) >= half_to_native(b); +#endif + } + }; -struct half_plus -{ - ROCPRIM_HOST_DEVICE inline - rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a + b; - #else - return native_to_half(half_to_native(a) + half_to_native(b)); - #endif - } -}; + struct half_equal_to + { + ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a == b; +#else + return half_to_native(a) == half_to_native(b); +#endif + } + }; -struct half_minus -{ - ROCPRIM_HOST_DEVICE inline - rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a - b; - #else - return native_to_half(half_to_native(a) - half_to_native(b)); - #endif - } -}; + struct half_not_equal_to + { + ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a != b; +#else + return half_to_native(a) != half_to_native(b); +#endif + } + }; -struct half_multiplies -{ - ROCPRIM_HOST_DEVICE inline - rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a * b; - #else - return native_to_half(half_to_native(a) * half_to_native(b)); - #endif - } -}; + struct half_plus + { + ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a + b; +#else + return native_to_half(half_to_native(a) + half_to_native(b)); +#endif + } + }; -struct half_maximum -{ - ROCPRIM_HOST_DEVICE inline - rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a < b ? b : a; - #else - return half_to_native(a) < half_to_native(b) ? b : a; - #endif - } -}; + struct half_minus + { + ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a - b; +#else + return native_to_half(half_to_native(a) - half_to_native(b)); +#endif + } + }; -struct half_minimum -{ - ROCPRIM_HOST_DEVICE inline - rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const - { - #if __HIP_DEVICE_COMPILE__ - return a < b ? a : b; - #else - return half_to_native(a) < half_to_native(b) ? a : b; - #endif - } -}; + struct half_multiplies + { + ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a * b; +#else + return native_to_half(half_to_native(a) * half_to_native(b)); +#endif + } + }; -template -inline auto get_random_data(size_t size, T min, T max) - -> typename std::enable_if::value, std::vector>::type -{ - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_int_distribution distribution(min, max); - std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); - return data; -} + struct half_maximum + { + ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a < b ? b : a; +#else + return half_to_native(a) < half_to_native(b) ? b : a; +#endif + } + }; -template -inline auto get_random_data(size_t size, T min, T max) - -> typename std::enable_if::value, std::vector>::type -{ - std::random_device rd; - std::default_random_engine gen(rd()); - // Generate floats when T is half - using dis_type = typename std::conditional::value, float, T>::type; - std::uniform_real_distribution distribution(min, max); - std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); - return data; -} + struct half_minimum + { + ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, + const rocprim::half& b) const + { +#if __HIP_DEVICE_COMPILE__ + return a < b ? a : b; +#else + return half_to_native(a) < half_to_native(b) ? a : b; +#endif + } + }; -template -inline std::vector get_random_data01(size_t size, float p) -{ - const size_t max_random_size = 1024 * 1024; - std::random_device rd; - std::default_random_engine gen(rd()); - std::bernoulli_distribution distribution(p); - std::vector data(size); - std::generate( - data.begin(), data.begin() + std::min(size, max_random_size), - [&]() { return distribution(gen); } - ); - for(size_t i = max_random_size; i < size; i += max_random_size) - { - std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + template + inline auto get_random_data(size_t size, T min, T max) -> + typename std::enable_if::value, std::vector>::type + { + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_int_distribution distribution(min, max); + std::vector data(size); + std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + return data; + } + + template + inline auto get_random_data(size_t size, T min, T max) -> + typename std::enable_if::value, std::vector>::type + { + std::random_device rd; + std::default_random_engine gen(rd()); + // Generate floats when T is half + using dis_type = + typename std::conditional::value, float, T>::type; + std::uniform_real_distribution distribution(min, max); + std::vector data(size); + std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + return data; + } + + template + inline std::vector get_random_data01(size_t size, float p) + { + const size_t max_random_size = 1024 * 1024; + std::random_device rd; + std::default_random_engine gen(rd()); + std::bernoulli_distribution distribution(p); + std::vector data(size); + std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { + return distribution(gen); + }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); + } + return data; } - return data; -} - -template -inline auto get_random_value(T min, T max) - -> typename std::enable_if::value, T>::type -{ - return get_random_data(1, min, max)[0]; -} - -// Can't use std::prefix_sum for inclusive/exclusive scan, because -// it does not handle short[] -> int(int a, int b) { a + b; } -> int[] -// they way we expect. That's because sum in std::prefix_sum's implementation -// is of type typename std::iterator_traits::value_type (short) -template -OutputIt host_inclusive_scan(InputIt first, InputIt last, - OutputIt d_first, BinaryOperation op) -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryOperation - >::type; - - if (first == last) return d_first; - - result_type sum = *first; - *d_first = sum; - while (++first != last) { - sum = op(sum, *first); - *++d_first = sum; + template + inline auto get_random_value(T min, T max) -> + typename std::enable_if::value, T>::type + { + return get_random_data(1, min, max)[0]; } - return ++d_first; -} -template -OutputIt host_exclusive_scan(InputIt first, InputIt last, - T initial_value, OutputIt d_first, - BinaryOperation op) -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryOperation - >::type; + // Can't use std::prefix_sum for inclusive/exclusive scan, because + // it does not handle short[] -> int(int a, int b) { a + b; } -> int[] + // they way we expect. That's because sum in std::prefix_sum's implementation + // is of type typename std::iterator_traits::value_type (short) + template + OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op) + { + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; - if (first == last) return d_first; + if(first == last) + return d_first; - result_type sum = initial_value; - *d_first = initial_value; + result_type sum = *first; + *d_first = sum; - while ((first+1) != last) - { - sum = op(sum, *first); - *++d_first = sum; - first++; + while(++first != last) + { + sum = op(sum, *first); + *++d_first = sum; + } + return ++d_first; } - return ++d_first; -} -template -OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, - T initial_value, OutputIt d_first, - BinaryOperation op, KeyCompare key_compare_op) -{ - using input_type = typename std::iterator_traits::value_type; - using result_type = typename ::rocprim::detail::match_result_type< - input_type, BinaryOperation - >::type; + template + OutputIt host_exclusive_scan( + InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op) + { + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; - if (first == last) return d_first; + if(first == last) + return d_first; - result_type sum = initial_value; - *d_first = initial_value; + result_type sum = initial_value; + *d_first = initial_value; - while ((first+1) != last) - { - if(key_compare_op(*k_first, *++k_first)) + while((first + 1) != last) { - sum = op(sum, *first); + sum = op(sum, *first); + *++d_first = sum; + first++; } - else + return ++d_first; + } + + template + OutputIt host_exclusive_scan_by_key(InputIt first, + InputIt last, + KeyIt k_first, + T initial_value, + OutputIt d_first, + BinaryOperation op, + KeyCompare key_compare_op) + { + using input_type = typename std::iterator_traits::value_type; + using result_type = + typename ::rocprim::detail::match_result_type::type; + + if(first == last) + return d_first; + + result_type sum = initial_value; + *d_first = initial_value; + + while((first + 1) != last) { - sum = initial_value; + if(key_compare_op(*k_first, *++k_first)) + { + sum = op(sum, *first); + } + else + { + sum = initial_value; + } + *++d_first = sum; + first++; } - *++d_first = sum; - first++; + return ++d_first; } - return ++d_first; -} -inline -size_t get_max_block_size() -{ - hipDeviceProp_t device_properties; - hipError_t error = hipGetDeviceProperties(&device_properties, 0); - if(error != hipSuccess) - { - std::cout << "HIP error: " << error - << " file: " << __FILE__ - << " line: " << __LINE__ - << std::endl; - std::exit(error); + inline size_t get_max_block_size() + { + hipDeviceProp_t device_properties; + hipError_t error = hipGetDeviceProperties(&device_properties, 0); + if(error != hipSuccess) + { + std::cout << "HIP error: " << error << " file: " << __FILE__ << " line: " << __LINE__ + << std::endl; + std::exit(error); + } + return device_properties.maxThreadsPerBlock; } - return device_properties.maxThreadsPerBlock; -} -template -struct is_custom_test_type : std::false_type -{ -}; + template + struct is_custom_test_type : std::false_type + { + }; -template -struct is_custom_test_array_type : std::false_type -{ -}; + template + struct is_custom_test_array_type : std::false_type + { + }; -template -struct inner_type -{ - using type = T; -}; + template + struct inner_type + { + using type = T; + }; -// Custom type used in tests -template -struct custom_test_type -{ - using value_type = T; + // Custom type used in tests + template + struct custom_test_type + { + using value_type = T; - T x; - T y; + T x; + T y; - // Non-zero values in default constructor for checking reduce and scan: - // ensure that scan_op(custom_test_type(), value) != value - ROCPRIM_HOST_DEVICE inline - custom_test_type() : x(12), y(34) {} + // Non-zero values in default constructor for checking reduce and scan: + // ensure that scan_op(custom_test_type(), value) != value + ROCPRIM_HOST_DEVICE inline custom_test_type() + : x(12) + , y(34) + { + } - ROCPRIM_HOST_DEVICE inline - custom_test_type(T x, T y) : x(x), y(y) {} + ROCPRIM_HOST_DEVICE inline custom_test_type(T x, T y) + : x(x) + , y(y) + { + } - ROCPRIM_HOST_DEVICE inline - custom_test_type(T xy) : x(xy), y(xy) {} + ROCPRIM_HOST_DEVICE inline custom_test_type(T xy) + : x(xy) + , y(xy) + { + } - template - ROCPRIM_HOST_DEVICE inline - custom_test_type(const custom_test_type& other) - { - x = other.x; - y = other.y; - } + template + ROCPRIM_HOST_DEVICE inline custom_test_type(const custom_test_type& other) + { + x = other.x; + y = other.y; + } - ROCPRIM_HOST_DEVICE inline - ~custom_test_type() {} + ROCPRIM_HOST_DEVICE inline ~custom_test_type() {} - ROCPRIM_HOST_DEVICE inline - custom_test_type& operator=(const custom_test_type& other) - { - x = other.x; - y = other.y; - return *this; - } + ROCPRIM_HOST_DEVICE inline custom_test_type& operator=(const custom_test_type& other) + { + x = other.x; + y = other.y; + return *this; + } - ROCPRIM_HOST_DEVICE inline - custom_test_type operator+(const custom_test_type& other) const - { - return custom_test_type(x + other.x, y + other.y); - } + ROCPRIM_HOST_DEVICE inline custom_test_type operator+(const custom_test_type& other) const + { + return custom_test_type(x + other.x, y + other.y); + } - ROCPRIM_HOST_DEVICE inline - custom_test_type operator-(const custom_test_type& other) const - { - return custom_test_type(x - other.x, y - other.y); - } + ROCPRIM_HOST_DEVICE inline custom_test_type operator-(const custom_test_type& other) const + { + return custom_test_type(x - other.x, y - other.y); + } - ROCPRIM_HOST_DEVICE inline - bool operator<(const custom_test_type& other) const - { - return (x < other.x || (x == other.x && y < other.y)); - } + ROCPRIM_HOST_DEVICE inline bool operator<(const custom_test_type& other) const + { + return (x < other.x || (x == other.x && y < other.y)); + } - ROCPRIM_HOST_DEVICE inline - bool operator>(const custom_test_type& other) const - { - return (x > other.x || (x == other.x && y > other.y)); - } + ROCPRIM_HOST_DEVICE inline bool operator>(const custom_test_type& other) const + { + return (x > other.x || (x == other.x && y > other.y)); + } - ROCPRIM_HOST_DEVICE inline - bool operator==(const custom_test_type& other) const - { - return (x == other.x && y == other.y); - } + ROCPRIM_HOST_DEVICE inline bool operator==(const custom_test_type& other) const + { + return (x == other.x && y == other.y); + } - ROCPRIM_HOST_DEVICE inline - bool operator!=(const custom_test_type& other) const - { - return !(*this == other); - } -}; + ROCPRIM_HOST_DEVICE inline bool operator!=(const custom_test_type& other) const + { + return !(*this == other); + } + }; -// Custom type used in tests -template -struct custom_test_array_type -{ - using value_type = T; - static constexpr size_t size = N; + // Custom type used in tests + template + struct custom_test_array_type + { + using value_type = T; + static constexpr size_t size = N; - T values[N]; + T values[N]; - ROCPRIM_HOST_DEVICE inline - custom_test_array_type() - { - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline custom_test_array_type() { - values[i] = T(i + 1); + for(size_t i = 0; i < N; i++) + { + values[i] = T(i + 1); + } } - } - ROCPRIM_HOST_DEVICE inline - custom_test_array_type(T v) - { - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline custom_test_array_type(T v) { - values[i] = v; + for(size_t i = 0; i < N; i++) + { + values[i] = v; + } } - } - template - ROCPRIM_HOST_DEVICE inline - custom_test_array_type(const custom_test_array_type& other) - { - for(size_t i = 0; i < N; i++) + template + ROCPRIM_HOST_DEVICE inline custom_test_array_type(const custom_test_array_type& other) { - values[i] = other.values[i]; + for(size_t i = 0; i < N; i++) + { + values[i] = other.values[i]; + } } - } - ROCPRIM_HOST_DEVICE inline - ~custom_test_array_type() {} + ROCPRIM_HOST_DEVICE inline ~custom_test_array_type() {} - ROCPRIM_HOST_DEVICE inline - custom_test_array_type& operator=(const custom_test_array_type& other) - { - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline custom_test_array_type& + operator=(const custom_test_array_type& other) { - values[i] = other.values[i]; + for(size_t i = 0; i < N; i++) + { + values[i] = other.values[i]; + } + return *this; } - return *this; - } - ROCPRIM_HOST_DEVICE inline - custom_test_array_type operator+(const custom_test_array_type& other) const - { - custom_test_array_type result; - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline custom_test_array_type + operator+(const custom_test_array_type& other) const { - result.values[i] = values[i] + other.values[i]; + custom_test_array_type result; + for(size_t i = 0; i < N; i++) + { + result.values[i] = values[i] + other.values[i]; + } + return result; } - return result; - } - ROCPRIM_HOST_DEVICE inline - custom_test_array_type operator-(const custom_test_array_type& other) const - { - custom_test_array_type result; - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline custom_test_array_type + operator-(const custom_test_array_type& other) const { - result.values[i] = values[i] - other.values[i]; + custom_test_array_type result; + for(size_t i = 0; i < N; i++) + { + result.values[i] = values[i] - other.values[i]; + } + return result; } - return result; - } - ROCPRIM_HOST_DEVICE inline - bool operator<(const custom_test_array_type& other) const - { - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline bool operator<(const custom_test_array_type& other) const { - if(values[i] >= other.values[i]) + for(size_t i = 0; i < N; i++) { - return false; + if(values[i] >= other.values[i]) + { + return false; + } } + return true; } - return true; - } - ROCPRIM_HOST_DEVICE inline - bool operator>(const custom_test_array_type& other) const - { - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline bool operator>(const custom_test_array_type& other) const { - if(values[i] <= other.values[i]) + for(size_t i = 0; i < N; i++) { - return false; + if(values[i] <= other.values[i]) + { + return false; + } } + return true; } - return true; - } - ROCPRIM_HOST_DEVICE inline - bool operator==(const custom_test_array_type& other) const - { - for(size_t i = 0; i < N; i++) + ROCPRIM_HOST_DEVICE inline bool operator==(const custom_test_array_type& other) const { - if(values[i] != other.values[i]) + for(size_t i = 0; i < N; i++) { - return false; + if(values[i] != other.values[i]) + { + return false; + } } + return true; } - return true; - } - ROCPRIM_HOST_DEVICE inline - bool operator!=(const custom_test_array_type& other) const + ROCPRIM_HOST_DEVICE inline bool operator!=(const custom_test_array_type& other) const + { + return !(*this == other); + } + }; + + template + inline std::ostream& operator<<(std::ostream& stream, const custom_test_type& value) { - return !(*this == other); + stream << "[" << value.x << "; " << value.y << "]"; + return stream; } -}; - -template inline -std::ostream& operator<<(std::ostream& stream, - const custom_test_type& value) -{ - stream << "[" << value.x << "; " << value.y << "]"; - return stream; -} -template inline -std::ostream& operator<<(std::ostream& stream, - const custom_test_array_type& value) -{ - stream << "["; - for(size_t i = 0; i < N; i++) + template + inline std::ostream& operator<<(std::ostream& stream, const custom_test_array_type& value) { - stream << value.values[i]; - if(i != N - 1) + stream << "["; + for(size_t i = 0; i < N; i++) { - stream << "; "; + stream << value.values[i]; + if(i != N - 1) + { + stream << "; "; + } } + stream << "]"; + return stream; } - stream << "]"; - return stream; -} - -template -struct is_custom_test_type> : std::true_type -{ -}; - -template -struct is_custom_test_array_type> : std::true_type -{ -}; + template + struct is_custom_test_type> : std::true_type + { + }; -template -struct inner_type> -{ - using type = T; -}; + template + struct is_custom_test_array_type> : std::true_type + { + }; -template -struct inner_type> -{ - using type = T; -}; + template + struct inner_type> + { + using type = T; + }; -namespace detail -{ - template - struct numeric_limits_custom_test_type : public std::numeric_limits + template + struct inner_type> { + using type = T; }; -} -// Numeric limits which also supports custom_test_type classes -template -struct numeric_limits : public std::conditional< - is_custom_test_type::value, - detail::numeric_limits_custom_test_type, - std::numeric_limits - >::type -{ -}; - -template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max) - -> typename std::enable_if< - is_custom_test_type::value && std::is_integral::value, - std::vector - >::type -{ - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_int_distribution distribution(min, max); - std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); - return data; -} + namespace detail + { + template + struct numeric_limits_custom_test_type : public std::numeric_limits + { + }; + } -template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max) - -> typename std::enable_if< - is_custom_test_type::value && std::is_floating_point::value, - std::vector - >::type -{ - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_real_distribution distribution(min, max); - std::vector data(size); - std::generate(data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); - return data; -} + // Numeric limits which also supports custom_test_type classes + template + struct numeric_limits : public std::conditional::value, + detail::numeric_limits_custom_test_type, + std::numeric_limits>::type + { + }; -template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max) - -> typename std::enable_if< - is_custom_test_array_type::value && std::is_integral::value, - std::vector - >::type -{ - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_int_distribution distribution(min, max); - std::vector data(size); - std::generate( - data.begin(), data.end(), - [&]() - { + template + inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max) + -> typename std::enable_if::value + && std::is_integral::value, + std::vector>::type + { + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_int_distribution distribution(min, max); + std::vector data(size); + std::generate( + data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); + return data; + } + + template + inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max) + -> typename std::enable_if::value + && std::is_floating_point::value, + std::vector>::type + { + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_real_distribution distribution(min, max); + std::vector data(size); + std::generate( + data.begin(), data.end(), [&]() { return T(distribution(gen), distribution(gen)); }); + return data; + } + + template + inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max) + -> typename std::enable_if::value + && std::is_integral::value, + std::vector>::type + { + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_int_distribution distribution(min, max); + std::vector data(size); + std::generate(data.begin(), data.end(), [&]() { T result; for(size_t i = 0; i < T::size; i++) { result.values[i] = distribution(gen); } return result; - } - ); - return data; -} - -template -inline auto get_random_value(typename T::value_type min, typename T::value_type max) - -> typename std::enable_if::value || is_custom_test_array_type::value, T>::type -{ - return get_random_data(1, min, max)[0]; -} + }); + return data; + } -template -auto assert_near(const std::vector& result, const std::vector& expected, const float percent) - -> typename std::enable_if::value>::type -{ - ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < result.size(); i++) + template + inline auto get_random_value(typename T::value_type min, typename T::value_type max) -> + typename std:: + enable_if::value || is_custom_test_array_type::value, T>::type { - auto diff = std::max(std::abs(percent * expected[i]), T(percent)); - ASSERT_NEAR(result[i], expected[i], diff) << "where index = " << i; + return get_random_data(1, min, max)[0]; } -} -template -auto assert_near(const std::vector& result, const std::vector& expected, const float percent) - -> typename std::enable_if::value>::type -{ - (void)percent; - ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < result.size(); i++) + template + auto assert_near(const std::vector& result, + const std::vector& expected, + const float percent) -> + typename std::enable_if::value>::type { - ASSERT_EQ(result[i], expected[i]) << "where index = " << i; + ASSERT_EQ(result.size(), expected.size()); + for(size_t i = 0; i < result.size(); i++) + { + auto diff = std::max(std::abs(percent * expected[i]), T(percent)); + ASSERT_NEAR(result[i], expected[i], diff) << "where index = " << i; + } } -} -void assert_near(const std::vector& result, const std::vector& expected, float percent) -{ - ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < result.size(); i++) + template + auto assert_near(const std::vector& result, + const std::vector& expected, + const float percent) -> + typename std::enable_if::value>::type { - auto diff = std::max(std::abs(percent * static_cast(expected[i])), percent); - ASSERT_NEAR(static_cast(result[i]), static_cast(expected[i]), diff) << "where index = " << i; + (void)percent; + ASSERT_EQ(result.size(), expected.size()); + for(size_t i = 0; i < result.size(); i++) + { + ASSERT_EQ(result[i], expected[i]) << "where index = " << i; + } } -} -template -auto assert_near(const std::vector>& result, const std::vector>& expected, const float percent) - -> typename std::enable_if::value>::type -{ - ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < result.size(); i++) + void assert_near(const std::vector& result, + const std::vector& expected, + float percent) { - auto diff1 = std::max(std::abs(percent * expected[i].x), T(percent)); - auto diff2 = std::max(std::abs(percent * expected[i].y), T(percent)); - ASSERT_NEAR(result[i].x, expected[i].x, diff1) << "where index = " << i; - ASSERT_NEAR(result[i].y, expected[i].y, diff2) << "where index = " << i; + ASSERT_EQ(result.size(), expected.size()); + for(size_t i = 0; i < result.size(); i++) + { + auto diff + = std::max(std::abs(percent * static_cast(expected[i])), percent); + ASSERT_NEAR(static_cast(result[i]), static_cast(expected[i]), diff) + << "where index = " << i; + } } -} -template -auto assert_near(const T& result, const T& expected, const float percent) - -> typename std::enable_if::value>::type -{ - auto diff = std::max(std::abs(percent * expected), T(percent)); - ASSERT_NEAR(result, expected, diff); -} + template + auto assert_near(const std::vector>& result, + const std::vector>& expected, + const float percent) -> + typename std::enable_if::value>::type + { + ASSERT_EQ(result.size(), expected.size()); + for(size_t i = 0; i < result.size(); i++) + { + auto diff1 = std::max(std::abs(percent * expected[i].x), T(percent)); + auto diff2 = std::max(std::abs(percent * expected[i].y), T(percent)); + ASSERT_NEAR(result[i].x, expected[i].x, diff1) << "where index = " << i; + ASSERT_NEAR(result[i].y, expected[i].y, diff2) << "where index = " << i; + } + } -template -auto assert_near(const T& result, const T& expected, const float percent) - -> typename std::enable_if::value>::type -{ - (void)percent; - ASSERT_EQ(result, expected); -} + template + auto assert_near(const T& result, const T& expected, const float percent) -> + typename std::enable_if::value>::type + { + auto diff = std::max(std::abs(percent * expected), T(percent)); + ASSERT_NEAR(result, expected, diff); + } + template + auto assert_near(const T& result, const T& expected, const float percent) -> + typename std::enable_if::value>::type + { + (void)percent; + ASSERT_EQ(result, expected); + } -template -auto assert_near(const custom_test_type& result, const custom_test_type& expected, const float percent) - -> typename std::enable_if::value>::type -{ - auto diff1 = std::max(std::abs(percent * expected.x), T(percent)); - auto diff2 = std::max(std::abs(percent * expected.y), T(percent)); - ASSERT_NEAR(result.x, expected.x, diff1); - ASSERT_NEAR(result.y, expected.y, diff2); -} + template + auto assert_near(const custom_test_type& result, + const custom_test_type& expected, + const float percent) -> + typename std::enable_if::value>::type + { + auto diff1 = std::max(std::abs(percent * expected.x), T(percent)); + auto diff2 = std::max(std::abs(percent * expected.y), T(percent)); + ASSERT_NEAR(result.x, expected.x, diff1); + ASSERT_NEAR(result.y, expected.y, diff2); + } -template -void assert_eq(const std::vector& result, const std::vector& expected) -{ - ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < result.size(); i++) + template + void assert_eq(const std::vector& result, const std::vector& expected) { - ASSERT_EQ(result[i], expected[i]) << "where index = " << i; + ASSERT_EQ(result.size(), expected.size()); + for(size_t i = 0; i < result.size(); i++) + { + ASSERT_EQ(result[i], expected[i]) << "where index = " << i; + } } -} -void assert_eq(const std::vector& result, const std::vector& expected) -{ - ASSERT_EQ(result.size(), expected.size()); - for(size_t i = 0; i < result.size(); i++) + void assert_eq(const std::vector& result, + const std::vector& expected) { - ASSERT_EQ(half_to_native(result[i]), half_to_native(expected[i])) << "where index = " << i; + ASSERT_EQ(result.size(), expected.size()); + for(size_t i = 0; i < result.size(); i++) + { + ASSERT_EQ(half_to_native(result[i]), half_to_native(expected[i])) + << "where index = " << i; + } } -} } // end test_utils namespace diff --git a/test/rocprim/test_warp_reduce.cpp b/test/rocprim/test_warp_reduce.cpp index 0b144d79b..ceb02e38a 100644 --- a/test/rocprim/test_warp_reduce.cpp +++ b/test/rocprim/test_warp_reduce.cpp @@ -20,10 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include #include #include +#include +#include // Google Test #include @@ -32,27 +32,24 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; -template< - class T, - unsigned int WarpSize -> +template struct params { - using type = T; + using type = T; static constexpr unsigned int warp_size = WarpSize; }; -template -class RocprimWarpReduceTests : public ::testing::Test { +template +class RocprimWarpReduceTests : public ::testing::Test +{ public: using params = Params; }; - typedef ::testing::Types< // shuffle based reduce params, @@ -77,22 +74,17 @@ typedef ::testing::Types< params, params, params, - params -> Params; + params> + Params; TYPED_TEST_CASE(RocprimWarpReduceTests, Params); -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_reduce_sum_kernel(T* device_input, T* device_output) +template +__global__ void warp_reduce_sum_kernel(T* device_input, T* device_output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T value = device_input[index]; @@ -100,21 +92,20 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output) __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, storage[warp_id]); - if(hipThreadIdx_x%LogicalWarpSize == 0) + if(hipThreadIdx_x % LogicalWarpSize == 0) { - device_output[index/LogicalWarpSize] = value; + device_output[index / LogicalWarpSize] = value; } } TYPED_TEST(RocprimWarpReduceTests, ReduceSum) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::params::type; + using T = typename TestFixture::params::type; constexpr size_t logical_warp_size = TestFixture::params::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; const size_t size = block_size * 4; // Given warp size not supported @@ -141,40 +132,34 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSum) } T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_reduce_sum_kernel), - dim3(size/block_size), dim3(block_size), 0, 0, - device_input, device_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_sum_kernel), + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { - if (std::is_integral::value) + if(std::is_integral::value) { ASSERT_EQ(output[i], expected[i]); } @@ -189,17 +174,12 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSum) HIP_CHECK(hipFree(device_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_allreduce_sum_kernel(T* device_input, T* device_output) +template +__global__ void warp_allreduce_sum_kernel(T* device_input, T* device_output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T value = device_input[index]; @@ -213,12 +193,11 @@ void warp_allreduce_sum_kernel(T* device_input, T* device_output) TYPED_TEST(RocprimWarpReduceTests, AllReduceSum) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::params::type; + using T = typename TestFixture::params::type; constexpr size_t logical_warp_size = TestFixture::params::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; const size_t size = block_size * 4; // Given warp size not supported @@ -241,48 +220,42 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSum) auto idx = i * logical_warp_size + j; value += input[idx]; } - for (size_t j = 0; j < logical_warp_size; j++) + for(size_t j = 0; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; + auto idx = i * logical_warp_size + j; expected[idx] = value; } } T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_allreduce_sum_kernel), - dim3(size/block_size), dim3(block_size), 0, 0, - device_input, device_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_allreduce_sum_kernel), + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { - if (std::is_integral::value) + if(std::is_integral::value) { ASSERT_EQ(output[i], expected[i]); } @@ -297,17 +270,12 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSum) HIP_CHECK(hipFree(device_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) +template +__global__ void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T value = device_input[index]; @@ -315,22 +283,21 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, valid, storage[warp_id]); - if(hipThreadIdx_x%LogicalWarpSize == 0) + if(hipThreadIdx_x % LogicalWarpSize == 0) { - device_output[index/LogicalWarpSize] = value; + device_output[index / LogicalWarpSize] = value; } } TYPED_TEST(RocprimWarpReduceTests, ReduceSumValid) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::params::type; + using T = typename TestFixture::params::type; constexpr size_t logical_warp_size = TestFixture::params::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; - const size_t size = block_size * 4; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; + const size_t size = block_size * 4; const size_t valid = logical_warp_size - 1; // Given warp size not supported @@ -357,40 +324,35 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumValid) } T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_reduce_sum_kernel), - dim3(size/block_size), dim3(block_size), 0, 0, - device_input, device_output, valid - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_sum_kernel), + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + valid); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { - if (std::is_integral::value) + if(std::is_integral::value) { ASSERT_EQ(output[i], expected[i]); } @@ -405,17 +367,12 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumValid) HIP_CHECK(hipFree(device_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_allreduce_sum_kernel(T* device_input, T* device_output, size_t valid) +template +__global__ void warp_allreduce_sum_kernel(T* device_input, T* device_output, size_t valid) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T value = device_input[index]; @@ -429,13 +386,12 @@ void warp_allreduce_sum_kernel(T* device_input, T* device_output, size_t valid) TYPED_TEST(RocprimWarpReduceTests, AllReduceSumValid) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::params::type; + using T = typename TestFixture::params::type; constexpr size_t logical_warp_size = TestFixture::params::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; - const size_t size = block_size * 4; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; + const size_t size = block_size * 4; const size_t valid = logical_warp_size - 1; // Given warp size not supported @@ -458,48 +414,43 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSumValid) auto idx = i * logical_warp_size + j; value += input[idx]; } - for (size_t j = 0; j < logical_warp_size; j++) + for(size_t j = 0; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; + auto idx = i * logical_warp_size + j; expected[idx] = value; } } T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_allreduce_sum_kernel), - dim3(size/block_size), dim3(block_size), 0, 0, - device_input, device_output, valid - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_allreduce_sum_kernel), + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + valid); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { - if (std::is_integral::value) + if(std::is_integral::value) { ASSERT_EQ(output[i], expected[i]); } @@ -517,14 +468,13 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSumValid) TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) { using base_type = typename TestFixture::params::type; - using T = test_utils::custom_test_type; + using T = test_utils::custom_test_type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::params::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; const size_t size = block_size * 4; // Given warp size not supported @@ -536,8 +486,7 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) // Generate data std::vector input(size); { - auto random_values = - test_utils::get_random_data(2 * input.size(), 0, 100); + auto random_values = test_utils::get_random_data(2 * input.size(), 0, 100); for(size_t i = 0; i < input.size(); i++) { input[i].x = random_values[i]; @@ -554,51 +503,47 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) for(size_t j = 0; j < logical_warp_size; j++) { auto idx = i * logical_warp_size + j; - value = value + input[idx]; + value = value + input[idx]; } expected[i] = value; } T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_reduce_sum_kernel), - dim3(size/block_size), dim3(block_size), 0, 0, - device_input, device_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_sum_kernel), + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { auto diffx = std::max(std::abs(0.1f * expected[i].x), base_type(0.01f)); - if(std::is_integral::value) diffx = 0; + if(std::is_integral::value) + diffx = 0; ASSERT_NEAR(output[i].x, expected[i].x, diffx); auto diffy = std::max(std::abs(0.1f * expected[i].y), base_type(0.01f)); - if(std::is_integral::value) diffy = 0; + if(std::is_integral::value) + diffy = 0; ASSERT_NEAR(output[i].y, expected[i].y, diffy); } @@ -606,21 +551,15 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) HIP_CHECK(hipFree(device_output)); } -template< - class T, - class Flag, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) +template +__global__ void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); - T value = input[index]; - auto flag = flags[index]; + T value = input[index]; + auto flag = flags[index]; using wreduce_t = rp::warp_reduce; __shared__ typename wreduce_t::storage_type storage[warps_no]; @@ -632,13 +571,12 @@ void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::params::type; - using flag_type = unsigned char; + using T = typename TestFixture::params::type; + using flag_type = unsigned char; constexpr size_t logical_warp_size = TestFixture::params::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; const size_t size = block_size * 4; // Given warp size not supported @@ -648,47 +586,40 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) } // Generate data - std::vector input = test_utils::get_random_data(size, 1, 10); // used for input + std::vector input = test_utils::get_random_data(size, 1, 10); // used for input std::vector flags = test_utils::get_random_data01(size, 0.25f); - for(size_t i = 0; i < flags.size(); i+= logical_warp_size) + for(size_t i = 0; i < flags.size(); i += logical_warp_size) { flags[i] = 1; } std::vector output(input.size()); - T* device_input; + T* device_input; flag_type* device_flags; - T* device_output; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - HIP_CHECK(hipMalloc(&device_flags, flags.size() * sizeof(typename decltype(flags)::value_type))); + T* device_output; + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_flags, flags.data(), - flags.size() * sizeof(flag_type), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_flags, flags.size() * sizeof(typename decltype(flags)::value_type))); + HIP_CHECK( + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + device_flags, flags.data(), flags.size() * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host std::vector expected(output.size()); - size_t segment_head_index = 0; - T reduction = input[0]; + size_t segment_head_index = 0; + T reduction = input[0]; for(size_t i = 0; i < output.size(); i++) { - if(i%logical_warp_size == 0 || flags[i]) + if(i % logical_warp_size == 0 || flags[i]) { expected[segment_head_index] = reduction; - segment_head_index = i; - reduction = input[i]; + segment_head_index = i; + reduction = input[i]; } else { @@ -699,23 +630,21 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(head_segmented_warp_reduce_kernel< - T, flag_type, block_size, logical_warp_size - >), - dim3(size/block_size), dim3(block_size), 0, 0, - device_input, device_flags, device_output - ); + HIP_KERNEL_NAME( + head_segmented_warp_reduce_kernel), + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_input, + device_flags, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < output.size(); i++) @@ -723,7 +652,8 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) if(flags[i]) { auto diff = std::max(std::abs(0.1f * expected[i]), T(0.01f)); - if(std::is_integral::value) diff = 0; + if(std::is_integral::value) + diff = 0; ASSERT_NEAR(output[i], expected[i], diff) << " with index: " << index; } } @@ -733,21 +663,15 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) HIP_CHECK(hipFree(device_output)); } -template< - class T, - class Flag, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) +template +__global__ void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); - T value = input[index]; - auto flag = flags[index]; + T value = input[index]; + auto flag = flags[index]; using wreduce_t = rp::warp_reduce; __shared__ typename wreduce_t::storage_type storage[warps_no]; @@ -759,13 +683,12 @@ void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::params::type; - using flag_type = unsigned char; + using T = typename TestFixture::params::type; + using flag_type = unsigned char; constexpr size_t logical_warp_size = TestFixture::params::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; const size_t size = block_size * 4; // Given warp size not supported @@ -775,41 +698,34 @@ TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) } // Generate data - std::vector input = test_utils::get_random_data(size, 1, 10); // used for input + std::vector input = test_utils::get_random_data(size, 1, 10); // used for input std::vector flags = test_utils::get_random_data01(size, 0.25f); - for(size_t i = logical_warp_size - 1; i < flags.size(); i+= logical_warp_size) + for(size_t i = logical_warp_size - 1; i < flags.size(); i += logical_warp_size) { flags[i] = 1; } std::vector output(input.size()); - T* device_input; + T* device_input; flag_type* device_flags; - T* device_output; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - HIP_CHECK(hipMalloc(&device_flags, flags.size() * sizeof(typename decltype(flags)::value_type))); + T* device_output; + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_flags, flags.data(), - flags.size() * sizeof(flag_type), - hipMemcpyHostToDevice - ) - ); + hipMalloc(&device_flags, flags.size() * sizeof(typename decltype(flags)::value_type))); + HIP_CHECK( + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy( + device_flags, flags.data(), flags.size() * sizeof(flag_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host - std::vector expected(output.size()); + std::vector expected(output.size()); std::vector segment_indexes; - size_t segment_index = 0; - T reduction; + size_t segment_index = 0; + T reduction; for(size_t i = 0; i < output.size(); i++) { // single value segments @@ -821,8 +737,8 @@ TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) else { segment_index = i; - reduction = input[i]; - auto next = i + 1; + reduction = input[i]; + auto next = i + 1; while(next < output.size() && !flags[next]) { reduction = reduction + input[next]; @@ -837,30 +753,29 @@ TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) // Launching kernel hipLaunchKernelGGL( - HIP_KERNEL_NAME(tail_segmented_warp_reduce_kernel< - T, flag_type, block_size, logical_warp_size - >), - dim3(size/block_size), dim3(block_size), 0, 0, - device_input, device_flags, device_output - ); + HIP_KERNEL_NAME( + tail_segmented_warp_reduce_kernel), + dim3(size / block_size), + dim3(block_size), + 0, + 0, + device_input, + device_flags, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < segment_indexes.size(); i++) { auto index = segment_indexes[i]; - auto diff = std::max(std::abs(0.1f * expected[i]), T(0.01f)); - if(std::is_integral::value) diff = 0; + auto diff = std::max(std::abs(0.1f * expected[i]), T(0.01f)); + if(std::is_integral::value) + diff = 0; ASSERT_NEAR(output[index], expected[index], diff) << " with index: " << index; } diff --git a/test/rocprim/test_warp_scan.cpp b/test/rocprim/test_warp_scan.cpp index 950182e43..115c38591 100644 --- a/test/rocprim/test_warp_scan.cpp +++ b/test/rocprim/test_warp_scan.cpp @@ -30,18 +30,15 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; // Params for tests -template< - class T, - unsigned int WarpSize -> +template struct params { - using type = T; + using type = T; static constexpr unsigned int warp_size = WarpSize; }; @@ -49,10 +46,11 @@ struct params // Test for scan ops taking single input value // --------------------------------------------------------- -template -class RocprimWarpScanTests : public ::testing::Test { +template +class RocprimWarpScanTests : public ::testing::Test +{ public: - using type = typename Params::type; + using type = typename Params::type; static constexpr unsigned int warp_size = Params::warp_size; }; @@ -88,21 +86,17 @@ typedef ::testing::Types< params, params -> RocprimWarpScanTestParams; + > + RocprimWarpScanTestParams; TYPED_TEST_CASE(RocprimWarpScanTests, RocprimWarpScanTestParams); -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_inclusive_scan_kernel(T* device_input, T* device_output) +template +__global__ void warp_inclusive_scan_kernel(T* device_input, T* device_output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T value = device_input[index]; @@ -118,12 +112,11 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScan) using T = typename TestFixture::type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; unsigned int grid_size = 4; - const size_t size = block_size * grid_size; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size()) @@ -141,53 +134,48 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScan) { for(size_t j = 0; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; - expected[idx] = input[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * logical_warp_size + j; + expected[idx] = input[idx] + expected[j > 0 ? idx - 1 : idx]; } } // Writing to device memory T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_inclusive_scan_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); // Validating results - if (std::is_integral::value) + if(std::is_integral::value) { for(size_t i = 0; i < output.size(); i++) { ASSERT_EQ(output[i], expected[i]); } } - else if (std::is_floating_point::value) + else if(std::is_floating_point::value) { for(size_t i = 0; i < output.size(); i++) { @@ -200,20 +188,14 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScan) HIP_CHECK(hipFree(device_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_inclusive_scan_reduce_kernel( - T* device_input, - T* device_output, - T* device_output_reductions) +template +__global__ void warp_inclusive_scan_reduce_kernel(T* device_input, + T* device_output, + T* device_output_reductions) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + ( hipBlockIdx_x * BlockSize ); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * BlockSize); T value = device_input[index]; T reduction; @@ -234,12 +216,11 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) using T = typename TestFixture::type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; unsigned int grid_size = 4; - const size_t size = block_size * grid_size; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size()) @@ -259,62 +240,52 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) { for(size_t j = 0; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; - expected[idx] = input[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * logical_warp_size + j; + expected[idx] = input[idx] + expected[j > 0 ? idx - 1 : idx]; } - expected_reductions[i] = expected[(i+1) * logical_warp_size - 1]; + expected_reductions[i] = expected[(i + 1) * logical_warp_size - 1]; } // Writing to device memory T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_reductions; HIP_CHECK( - hipMalloc( - &device_output_reductions, - output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(hipMalloc(&device_output_reductions, + output_reductions.size() + * sizeof(typename decltype(output_reductions)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_inclusive_scan_reduce_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, device_output_reductions - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_output_reductions); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results - if (std::is_integral::value) + if(std::is_integral::value) { for(size_t i = 0; i < output.size(); i++) { @@ -326,7 +297,7 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) ASSERT_EQ(output_reductions[i], expected_reductions[i]); } } - else if (std::is_floating_point::value) + else if(std::is_floating_point::value) { for(size_t i = 0; i < output.size(); i++) { @@ -346,17 +317,12 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) HIP_CHECK(hipFree(device_output_reductions)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) +template +__global__ void warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T value = device_input[index]; @@ -372,12 +338,11 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) using T = typename TestFixture::type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; unsigned int grid_size = 4; - const size_t size = block_size * grid_size; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size()) @@ -389,7 +354,7 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) std::vector input = test_utils::get_random_data(size, -100, 100); std::vector output(size); std::vector expected(input.size(), 0); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host for(size_t i = 0; i < input.size() / logical_warp_size; i++) @@ -397,53 +362,49 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) expected[i * logical_warp_size] = init; for(size_t j = 1; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; - expected[idx] = input[idx-1] + expected[idx-1]; + auto idx = i * logical_warp_size + j; + expected[idx] = input[idx - 1] + expected[idx - 1]; } } // Writing to device memory T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_exclusive_scan_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, init - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); // Validating results - if (std::is_integral::value) + if(std::is_integral::value) { for(size_t i = 0; i < output.size(); i++) { ASSERT_EQ(output[i], expected[i]); } } - else if (std::is_floating_point::value) + else if(std::is_floating_point::value) { for(size_t i = 0; i < output.size(); i++) { @@ -456,21 +417,15 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) HIP_CHECK(hipFree(device_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_exclusive_scan_reduce_kernel( - T* device_input, - T* device_output, - T* device_output_reductions, - T init) +template +__global__ void warp_exclusive_scan_reduce_kernel(T* device_input, + T* device_output, + T* device_output_reductions, + T init) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T value = device_input[index]; T reduction; @@ -491,12 +446,11 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) using T = typename TestFixture::type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; unsigned int grid_size = 4; - const size_t size = block_size * grid_size; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size()) @@ -510,7 +464,7 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) std::vector output_reductions(size / logical_warp_size); std::vector expected(input.size(), 0); std::vector expected_reductions(output_reductions.size(), 0); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host for(size_t i = 0; i < input.size() / logical_warp_size; i++) @@ -518,8 +472,8 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) expected[i * logical_warp_size] = init; for(size_t j = 1; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; - expected[idx] = input[idx-1] + expected[idx-1]; + auto idx = i * logical_warp_size + j; + expected[idx] = input[idx - 1] + expected[idx - 1]; } expected_reductions[i] = 0; @@ -532,54 +486,45 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) // Writing to device memory T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); - T* device_output_reductions; HIP_CHECK( - hipMalloc( - &device_output_reductions, - output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type) - ) - ); + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + T* device_output_reductions; + HIP_CHECK(hipMalloc(&device_output_reductions, + output_reductions.size() + * sizeof(typename decltype(output_reductions)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_exclusive_scan_reduce_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output, device_output_reductions, init - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output, + device_output_reductions, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results - if (std::is_integral::value) + if(std::is_integral::value) { for(size_t i = 0; i < output.size(); i++) { @@ -591,7 +536,7 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) ASSERT_EQ(output_reductions[i], expected_reductions[i]); } } - else if (std::is_floating_point::value) + else if(std::is_floating_point::value) { for(size_t i = 0; i < output.size(); i++) { @@ -611,21 +556,15 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) HIP_CHECK(hipFree(device_output_reductions)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_scan_kernel( - T* device_input, - T* device_inclusive_output, - T* device_exclusive_output, - T init) +template +__global__ void warp_scan_kernel(T* device_input, + T* device_inclusive_output, + T* device_exclusive_output, + T init) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T input = device_input[index]; T inclusive_output, exclusive_output; @@ -643,12 +582,11 @@ TYPED_TEST(RocprimWarpScanTests, Scan) using T = typename TestFixture::type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; unsigned int grid_size = 4; - const size_t size = block_size * grid_size; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size()) @@ -662,7 +600,7 @@ TYPED_TEST(RocprimWarpScanTests, Scan) std::vector output_exclusive(size); std::vector expected_inclusive(output_inclusive.size(), 0); std::vector expected_exclusive(output_exclusive.size(), 0); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host for(size_t i = 0; i < input.size() / logical_warp_size; i++) @@ -670,70 +608,58 @@ TYPED_TEST(RocprimWarpScanTests, Scan) expected_exclusive[i * logical_warp_size] = init; for(size_t j = 0; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; - expected_inclusive[idx] = input[idx] + expected_inclusive[j > 0 ? idx-1 : idx]; + auto idx = i * logical_warp_size + j; + expected_inclusive[idx] = input[idx] + expected_inclusive[j > 0 ? idx - 1 : idx]; if(j > 0) { - expected_exclusive[idx] = input[idx-1] + expected_exclusive[idx-1]; + expected_exclusive[idx] = input[idx - 1] + expected_exclusive[idx - 1]; } } } // Writing to device memory T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); - T* device_inclusive_output; HIP_CHECK( - hipMalloc( - &device_inclusive_output, - output_inclusive.size() * sizeof(typename decltype(output_inclusive)::value_type) - ) - ); + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + T* device_inclusive_output; + HIP_CHECK(hipMalloc(&device_inclusive_output, + output_inclusive.size() + * sizeof(typename decltype(output_inclusive)::value_type))); T* device_exclusive_output; - HIP_CHECK( - hipMalloc( - &device_exclusive_output, - output_exclusive.size() * sizeof(typename decltype(output_exclusive)::value_type) - ) - ); + HIP_CHECK(hipMalloc(&device_exclusive_output, + output_exclusive.size() + * sizeof(typename decltype(output_exclusive)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_scan_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_inclusive_output, device_exclusive_output, init - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_scan_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_inclusive_output, + device_exclusive_output, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output_inclusive.data(), device_inclusive_output, - output_inclusive.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_inclusive.data(), + device_inclusive_output, + output_inclusive.size() * sizeof(T), + hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_exclusive.data(), device_exclusive_output, - output_exclusive.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_exclusive.data(), + device_exclusive_output, + output_exclusive.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results - if (std::is_integral::value) + if(std::is_integral::value) { for(size_t i = 0; i < output_inclusive.size(); i++) { @@ -741,7 +667,7 @@ TYPED_TEST(RocprimWarpScanTests, Scan) ASSERT_EQ(output_exclusive[i], expected_exclusive[i]); } } - else if (std::is_floating_point::value) + else if(std::is_floating_point::value) { for(size_t i = 0; i < output_inclusive.size(); i++) { @@ -758,22 +684,16 @@ TYPED_TEST(RocprimWarpScanTests, Scan) HIP_CHECK(hipFree(device_exclusive_output)); } -template< - class T, - unsigned int BlockSize, - unsigned int LogicalWarpSize -> -__global__ -void warp_scan_reduce_kernel( - T* device_input, - T* device_inclusive_output, - T* device_exclusive_output, - T* device_output_reductions, - T init) +template +__global__ void warp_scan_reduce_kernel(T* device_input, + T* device_inclusive_output, + T* device_exclusive_output, + T* device_output_reductions, + T init) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; - const unsigned int warp_id = rp::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + const unsigned int warp_id = rp::detail::logical_warp_id(); + unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); T input = device_input[index]; T inclusive_output, exclusive_output, reduction; @@ -795,12 +715,11 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) using T = typename TestFixture::type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; unsigned int grid_size = 4; - const size_t size = block_size * grid_size; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size()) @@ -816,7 +735,7 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) std::vector expected_inclusive(output_inclusive.size(), 0); std::vector expected_exclusive(output_exclusive.size(), 0); std::vector expected_reductions(output_reductions.size(), 0); - const T init = test_utils::get_random_value(0, 100); + const T init = test_utils::get_random_value(0, 100); // Calculate expected results on host for(size_t i = 0; i < input.size() / logical_warp_size; i++) @@ -824,87 +743,69 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) expected_exclusive[i * logical_warp_size] = init; for(size_t j = 0; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; - expected_inclusive[idx] = input[idx] + expected_inclusive[j > 0 ? idx-1 : idx]; + auto idx = i * logical_warp_size + j; + expected_inclusive[idx] = input[idx] + expected_inclusive[j > 0 ? idx - 1 : idx]; if(j > 0) { - expected_exclusive[idx] = input[idx-1] + expected_exclusive[idx-1]; + expected_exclusive[idx] = input[idx - 1] + expected_exclusive[idx - 1]; } } - expected_reductions[i] = expected_inclusive[(i+1) * logical_warp_size - 1]; + expected_reductions[i] = expected_inclusive[(i + 1) * logical_warp_size - 1]; } // Writing to device memory T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); - T* device_inclusive_output; HIP_CHECK( - hipMalloc( - &device_inclusive_output, - output_inclusive.size() * sizeof(typename decltype(output_inclusive)::value_type) - ) - ); + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + T* device_inclusive_output; + HIP_CHECK(hipMalloc(&device_inclusive_output, + output_inclusive.size() + * sizeof(typename decltype(output_inclusive)::value_type))); T* device_exclusive_output; - HIP_CHECK( - hipMalloc( - &device_exclusive_output, - output_exclusive.size() * sizeof(typename decltype(output_exclusive)::value_type) - ) - ); + HIP_CHECK(hipMalloc(&device_exclusive_output, + output_exclusive.size() + * sizeof(typename decltype(output_exclusive)::value_type))); T* device_output_reductions; - HIP_CHECK( - hipMalloc( - &device_output_reductions, - output_reductions.size() * sizeof(typename decltype(output_reductions)::value_type) - ) - ); + HIP_CHECK(hipMalloc(&device_output_reductions, + output_reductions.size() + * sizeof(typename decltype(output_reductions)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(warp_scan_reduce_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, - device_inclusive_output, device_exclusive_output, device_output_reductions, init - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_scan_reduce_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_inclusive_output, + device_exclusive_output, + device_output_reductions, + init); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output_inclusive.data(), device_inclusive_output, - output_inclusive.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_inclusive.data(), + device_inclusive_output, + output_inclusive.size() * sizeof(T), + hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_exclusive.data(), device_exclusive_output, - output_exclusive.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_exclusive.data(), + device_exclusive_output, + output_exclusive.size() * sizeof(T), + hipMemcpyDeviceToHost)); - HIP_CHECK( - hipMemcpy( - output_reductions.data(), device_output_reductions, - output_reductions.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output_reductions.data(), + device_output_reductions, + output_reductions.size() * sizeof(T), + hipMemcpyDeviceToHost)); // Validating results - if (std::is_integral::value) + if(std::is_integral::value) { for(size_t i = 0; i < output_inclusive.size(); i++) { @@ -917,7 +818,7 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) ASSERT_EQ(output_reductions[i], expected_reductions[i]); } } - else if (std::is_floating_point::value) + else if(std::is_floating_point::value) { for(size_t i = 0; i < output_inclusive.size(); i++) { @@ -943,15 +844,14 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) { using base_type = typename TestFixture::type; - using T = test_utils::custom_test_type; + using T = test_utils::custom_test_type; // logical warp side for warp primitive, execution warp size is always rp::warp_size() constexpr size_t logical_warp_size = TestFixture::warp_size; - constexpr size_t block_size = - rp::detail::is_power_of_two(logical_warp_size) - ? rp::max(rp::warp_size(), logical_warp_size * 4) - : (rp::warp_size()/logical_warp_size) * logical_warp_size; + constexpr size_t block_size = rp::detail::is_power_of_two(logical_warp_size) + ? rp::max(rp::warp_size(), logical_warp_size * 4) + : (rp::warp_size() / logical_warp_size) * logical_warp_size; unsigned int grid_size = 4; - const size_t size = block_size * grid_size; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size()) @@ -966,8 +866,7 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) // Initializing input data { - auto random_values = - test_utils::get_random_data(2 * input.size(), 0, 100); + auto random_values = test_utils::get_random_data(2 * input.size(), 0, 100); for(size_t i = 0; i < input.size(); i++) { input[i].x = random_values[i]; @@ -980,58 +879,55 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) { for(size_t j = 0; j < logical_warp_size; j++) { - auto idx = i * logical_warp_size + j; - expected[idx] = input[idx] + expected[j > 0 ? idx-1 : idx]; + auto idx = i * logical_warp_size + j; + expected[idx] = input[idx] + expected[j > 0 ? idx - 1 : idx]; } } // Writing to device memory T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK( + hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); + HIP_CHECK( + hipMalloc(&device_output, output.size() * sizeof(typename decltype(output)::value_type))); HIP_CHECK( - hipMemcpy( - device_input, input.data(), - input.size() * sizeof(T), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(device_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); // Launching kernel hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_inclusive_scan_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - device_input, device_output - ); + dim3(grid_size), + dim3(block_size), + 0, + 0, + device_input, + device_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory HIP_CHECK( - hipMemcpy( - output.data(), device_output, - output.size() * sizeof(T), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output.data(), device_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); // Validating results - if (std::is_integral::value) + if(std::is_integral::value) { for(size_t i = 0; i < output.size(); i++) { ASSERT_EQ(output[i], expected[i]); } } - else if (std::is_floating_point::value) + else if(std::is_floating_point::value) { for(size_t i = 0; i < output.size(); i++) { - auto tolerance_x = std::max(std::abs(0.1f * expected[i].x), base_type(0.01f)); - auto tolerance_y = std::max(std::abs(0.1f * expected[i].y), base_type(0.01f)); + auto tolerance_x + = std::max(std::abs(0.1f * expected[i].x), base_type(0.01f)); + auto tolerance_y + = std::max(std::abs(0.1f * expected[i].y), base_type(0.01f)); ASSERT_NEAR(output[i].x, expected[i].x, tolerance_x); ASSERT_NEAR(output[i].y, expected[i].y, tolerance_y); } @@ -1040,4 +936,3 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_output)); } - diff --git a/test/rocprim/test_warp_sort.cpp b/test/rocprim/test_warp_sort.cpp index 03484e4f2..d8020d3b9 100644 --- a/test/rocprim/test_warp_sort.cpp +++ b/test/rocprim/test_warp_sort.cpp @@ -33,25 +33,26 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) namespace rp = rocprim; -template +template struct params { - using type = T; + using type = T; static constexpr unsigned int warp_size = WarpSize; }; -template -class RocprimWarpSortShuffleBasedTests : public ::testing::Test { +template +class RocprimWarpSortShuffleBasedTests : public ::testing::Test +{ public: - using type = typename Params::type; + using type = typename Params::type; static constexpr unsigned int warp_size = Params::warp_size; }; -template +template bool test(const T& a, const T& b) { return a < b; @@ -73,19 +74,16 @@ typedef ::testing::Types< params, 32U>, params, 64U> -> WarpSizes; + > + WarpSizes; TYPED_TEST_CASE(RocprimWarpSortShuffleBasedTests, WarpSizes); -template< - class T, - unsigned int LogicalWarpSize -> -__global__ -void test_hip_warp_sort(T* d_output) +template +__global__ void test_hip_warp_sort(T* d_output) { - unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); - T value = d_output[i]; + unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + T value = d_output[i]; rp::warp_sort wsort; wsort.sort(value); d_output[i] = value; @@ -94,11 +92,11 @@ void test_hip_warp_sort(T* d_output) TYPED_TEST(RocprimWarpSortShuffleBasedTests, Sort) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::type; + using T = typename TestFixture::type; constexpr size_t logical_warp_size = TestFixture::warp_size; - const size_t block_size = std::max(rp::warp_size(), 4 * logical_warp_size); - constexpr size_t grid_size = 4; - const size_t size = block_size * grid_size; + const size_t block_size = std::max(rp::warp_size(), 4 * logical_warp_size); + constexpr size_t grid_size = 4; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size() || !rp::detail::is_power_of_two(logical_warp_size)) @@ -114,41 +112,35 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, Sort) for(size_t i = 0; i < output.size() / logical_warp_size; i++) { - std::sort(expected.begin() + (i * logical_warp_size), expected.begin() + ((i + 1) * logical_warp_size)); + std::sort(expected.begin() + (i * logical_warp_size), + expected.begin() + ((i + 1) * logical_warp_size)); } // Writing to device memory T* d_output; - HIP_CHECK( - hipMalloc(&d_output, output.size() * sizeof(typename decltype(output)::value_type)) - ); - - HIP_CHECK( - hipMemcpy( - d_output, output.data(), - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(typename decltype(output)::value_type))); + + HIP_CHECK(hipMemcpy(d_output, + output.data(), + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyHostToDevice)); // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(test_hip_warp_sort), - dim3(grid_size), dim3(block_size), 0, 0, - d_output - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_hip_warp_sort), + dim3(grid_size), + dim3(block_size), + 0, + 0, + d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); // Read from device memory - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(typename decltype(output)::value_type), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), + d_output, + output.size() * sizeof(typename decltype(output)::value_type), + hipMemcpyDeviceToHost)); for(size_t i = 0; i < output.size(); i++) { @@ -156,31 +148,26 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, Sort) } } -template< - class KeyType, - class ValueType, - unsigned int LogicalWarpSize -> -__global__ -void test_hip_sort_key_value_kernel(KeyType* d_output_key, ValueType* d_output_value) +template +__global__ void test_hip_sort_key_value_kernel(KeyType* d_output_key, ValueType* d_output_value) { - unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); - KeyType key = d_output_key[i]; - ValueType value = d_output_value[i]; + unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + KeyType key = d_output_key[i]; + ValueType value = d_output_value[i]; rp::warp_sort wsort; wsort.sort(key, value); - d_output_key[i] = key; + d_output_key[i] = key; d_output_value[i] = value; } TYPED_TEST(RocprimWarpSortShuffleBasedTests, SortKeyInt) { // logical warp side for warp primitive, execution warp size is always rp::warp_size() - using T = typename TestFixture::type; + using T = typename TestFixture::type; constexpr size_t logical_warp_size = TestFixture::warp_size; - const size_t block_size = std::max(rp::warp_size(), 4 * logical_warp_size); - constexpr size_t grid_size = 4; - const size_t size = block_size * grid_size; + const size_t block_size = std::max(rp::warp_size(), 4 * logical_warp_size); + constexpr size_t grid_size = 4; + const size_t size = block_size * grid_size; // Given warp size not supported if(logical_warp_size > rp::warp_size() || !rp::detail::is_power_of_two(logical_warp_size)) @@ -191,12 +178,12 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, SortKeyInt) // Generate data std::vector output_key(size); std::iota(output_key.begin(), output_key.end(), 0); - std::shuffle(output_key.begin(), output_key.end(), std::mt19937{std::random_device{}()}); + std::shuffle(output_key.begin(), output_key.end(), std::mt19937 {std::random_device {}()}); std::vector output_value = test_utils::get_random_data(size, -100, 100); // Combine vectors to form pairs with key and value std::vector> target(size); - for (unsigned i = 0; i < target.size(); i++) + for(unsigned i = 0; i < target.size(); i++) target[i] = std::make_pair(output_key[i], output_value[i]); // Calculate expected results on host @@ -204,62 +191,51 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, SortKeyInt) for(size_t i = 0; i < expected.size() / logical_warp_size; i++) { - std::sort(expected.begin() + (i * logical_warp_size), expected.begin() + ((i + 1) * logical_warp_size)); + std::sort(expected.begin() + (i * logical_warp_size), + expected.begin() + ((i + 1) * logical_warp_size)); } // Writing to device memory T* d_output_key; T* d_output_value; - HIP_CHECK( - hipMalloc(&d_output_key, output_key.size() * sizeof(typename decltype(output_key)::value_type)) - ); - HIP_CHECK( - hipMalloc(&d_output_value, output_value.size() * sizeof(typename decltype(output_value)::value_type)) - ); - - HIP_CHECK( - hipMemcpy( - d_output_key, output_key.data(), - output_key.size() * sizeof(typename decltype(output_key)::value_type), - hipMemcpyHostToDevice - ) - ); - - HIP_CHECK( - hipMemcpy( - d_output_value, output_value.data(), - output_value.size() * sizeof(typename decltype(output_value)::value_type), - hipMemcpyHostToDevice - ) - ); - + HIP_CHECK(hipMalloc(&d_output_key, + output_key.size() * sizeof(typename decltype(output_key)::value_type))); + HIP_CHECK(hipMalloc(&d_output_value, + output_value.size() * sizeof(typename decltype(output_value)::value_type))); + + HIP_CHECK(hipMemcpy(d_output_key, + output_key.data(), + output_key.size() * sizeof(typename decltype(output_key)::value_type), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemcpy(d_output_value, + output_value.data(), + output_value.size() * sizeof(typename decltype(output_value)::value_type), + hipMemcpyHostToDevice)); + // Launching kernel - hipLaunchKernelGGL( - HIP_KERNEL_NAME(test_hip_sort_key_value_kernel), - dim3(grid_size), dim3(block_size), 0, 0, - d_output_key, d_output_value - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(test_hip_sort_key_value_kernel), + dim3(grid_size), + dim3(block_size), + 0, + 0, + d_output_key, + d_output_value); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); - + // Read from device memory - HIP_CHECK( - hipMemcpy( - output_key.data(), d_output_key, - output_key.size() * sizeof(typename decltype(output_key)::value_type), - hipMemcpyDeviceToHost - ) - ); - - HIP_CHECK( - hipMemcpy( - output_value.data(), d_output_value, - output_value.size() * sizeof(typename decltype(output_value)::value_type), - hipMemcpyDeviceToHost - ) - ); - + HIP_CHECK(hipMemcpy(output_key.data(), + d_output_key, + output_key.size() * sizeof(typename decltype(output_key)::value_type), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipMemcpy(output_value.data(), + d_output_value, + output_value.size() * sizeof(typename decltype(output_value)::value_type), + hipMemcpyDeviceToHost)); + for(size_t i = 0; i < expected.size(); i++) { ASSERT_EQ(output_key[i], expected[i].first); diff --git a/test/rocprim/test_zip_iterator.cpp b/test/rocprim/test_zip_iterator.cpp index 79fca0416..5c8c42beb 100644 --- a/test/rocprim/test_zip_iterator.cpp +++ b/test/rocprim/test_zip_iterator.cpp @@ -30,49 +30,33 @@ #include "test_utils.hpp" -#define HIP_CHECK(error) ASSERT_EQ(static_cast(error),hipSuccess) +#define HIP_CHECK(error) ASSERT_EQ(static_cast(error), hipSuccess) TEST(RocprimZipIteratorTests, Traits) { - ASSERT_TRUE(( - std::is_same< - rocprim::zip_iterator< - rocprim::tuple - >::reference, - rocprim::tuple - >::value - )); - ASSERT_TRUE(( - std::is_same< - rocprim::zip_iterator< - rocprim::tuple - >::reference, - rocprim::tuple - >::value - )); + ASSERT_TRUE( + (std::is_same>::reference, + rocprim::tuple>::value)); + ASSERT_TRUE((std::is_same< + rocprim::zip_iterator>::reference, + rocprim::tuple>::value)); auto to_double = [](const int& x) -> double { return double(x); }; - ASSERT_TRUE(( - std::is_same< + ASSERT_TRUE( + (std::is_same< rocprim::zip_iterator< - rocprim::tuple< - rocprim::counting_iterator, - rocprim::transform_iterator - > - >::reference, - rocprim::tuple< - rocprim::counting_iterator::reference, - rocprim::transform_iterator::reference - > - >::value - )); + rocprim::tuple, + rocprim::transform_iterator>>::reference, + rocprim::tuple::reference, + rocprim::transform_iterator::reference>>:: + value)); } TEST(RocprimZipIteratorTests, Basics) { - int a[] = { 1, 2, 3, 4, 5}; - int b[] = { 6, 7, 8, 9, 10}; - double c[] = { 1., 2., 3., 4., 5.}; - auto iterator_tuple = rocprim::make_tuple(a, b, c); + int a[] = {1, 2, 3, 4, 5}; + int b[] = {6, 7, 8, 9, 10}; + double c[] = {1., 2., 3., 4., 5.}; + auto iterator_tuple = rocprim::make_tuple(a, b, c); // Constructor rocprim::zip_iterator zit(iterator_tuple); @@ -85,7 +69,7 @@ TEST(RocprimZipIteratorTests, Basics) ASSERT_EQ(b[0], 8); ASSERT_EQ(c[0], 15.0); auto ref = *zit; - ref = rocprim::make_tuple(1, 6, 1.0); + ref = rocprim::make_tuple(1, 6, 1.0); ASSERT_EQ(*zit, rocprim::make_tuple(1, 6, 1.0)); ASSERT_EQ(a[0], 1); ASSERT_EQ(b[0], 6); @@ -134,14 +118,13 @@ TEST(RocprimZipIteratorTests, Basics) ASSERT_EQ((zit2[0]), rocprim::make_tuple(1, 6, 1.0)); ASSERT_EQ((zit2[2]), rocprim::make_tuple(3, 8, 3.0)); // + - ASSERT_EQ(*(zit2+3), rocprim::make_tuple(4, 9, 4.0)); + ASSERT_EQ(*(zit2 + 3), rocprim::make_tuple(4, 9, 4.0)); } -template +template struct tuple3_transform_op { - __device__ __host__ - T1 operator()(const rocprim::tuple& t) const + __device__ __host__ T1 operator()(const rocprim::tuple& t) const { return T1(rocprim::get<0>(t) + rocprim::get<1>(t) + rocprim::get<2>(t)); } @@ -149,12 +132,12 @@ struct tuple3_transform_op TEST(RocprimZipIteratorTests, Transform) { - using T1 = int; - using T2 = double; - using T3 = unsigned char; - using U = T1; - const bool debug_synchronous = false; - const size_t size = 1024 * 16; + using T1 = int; + using T2 = double; + using T3 = unsigned char; + using U = T1; + const bool debug_synchronous = false; + const size_t size = 1024 * 16; // using default stream hipStream_t stream = 0; @@ -163,84 +146,53 @@ TEST(RocprimZipIteratorTests, Transform) std::vector input1 = test_utils::get_random_data(size, 1, 100); std::vector input2 = test_utils::get_random_data(size, 1, 100); std::vector input3 = test_utils::get_random_data(size, 1, 100); - std::vector output(input1.size()); + std::vector output(input1.size()); - T1 * d_input1; - T2 * d_input2; - T3 * d_input3; - U * d_output; + T1* d_input1; + T2* d_input2; + T3* d_input3; + U* d_output; HIP_CHECK(hipMalloc(&d_input1, input1.size() * sizeof(T1))); HIP_CHECK(hipMalloc(&d_input2, input2.size() * sizeof(T2))); HIP_CHECK(hipMalloc(&d_input3, input3.size() * sizeof(T3))); HIP_CHECK(hipMalloc(&d_output, output.size() * sizeof(U))); HIP_CHECK( - hipMemcpy( - d_input1, input1.data(), - input1.size() * sizeof(T1), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input1, input1.data(), input1.size() * sizeof(T1), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input2, input2.data(), - input2.size() * sizeof(T2), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input2, input2.data(), input2.size() * sizeof(T2), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input3, input3.data(), - input3.size() * sizeof(T3), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input3, input3.data(), input3.size() * sizeof(T3), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host std::vector expected(input1.size()); std::transform( rocprim::make_zip_iterator( - rocprim::make_tuple(input1.begin(), input2.begin(), input3.begin()) - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(input1.end(), input2.end(), input3.end()) - ), + rocprim::make_tuple(input1.begin(), input2.begin(), input3.begin())), + rocprim::make_zip_iterator(rocprim::make_tuple(input1.end(), input2.end(), input3.end())), expected.begin(), - tuple3_transform_op() - ); + tuple3_transform_op()); // Run - HIP_CHECK( - rocprim::transform( - rocprim::make_zip_iterator( - rocprim::make_tuple( - d_input1, d_input2, d_input3 - ) - ), - d_output, - input1.size(), - tuple3_transform_op(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::transform( + rocprim::make_zip_iterator(rocprim::make_tuple(d_input1, d_input2, d_input3)), + d_output, + input1.size(), + tuple3_transform_op(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host - HIP_CHECK( - hipMemcpy( - output.data(), d_output, - output.size() * sizeof(U), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(output.data(), d_output, output.size() * sizeof(U), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected for(size_t i = 0; i < output.size(); i++) { auto diff = std::max(std::abs(0.01f * expected[i]), U(0.01f)); - if(std::is_integral::value) diff = 0; + if(std::is_integral::value) + diff = 0; ASSERT_NEAR(output[i], expected[i], diff) << "where index = " << i; } @@ -250,41 +202,36 @@ TEST(RocprimZipIteratorTests, Transform) hipFree(d_output); } -template +template struct tuple3to2_transform_op { - __device__ __host__ inline - rocprim::tuple operator()(const rocprim::tuple& t) const + __device__ __host__ inline rocprim::tuple + operator()(const rocprim::tuple& t) const { - return rocprim::make_tuple( - rocprim::get<0>(t), T2(rocprim::get<1>(t) + rocprim::get<2>(t)) - ); + return rocprim::make_tuple(rocprim::get<0>(t), T2(rocprim::get<1>(t) + rocprim::get<2>(t))); } }; -template +template struct tuple2_reduce_op { - __device__ __host__ inline - rocprim::tuple operator()(const rocprim::tuple& t1, - const rocprim::tuple& t2) const + __device__ __host__ inline rocprim::tuple + operator()(const rocprim::tuple& t1, const rocprim::tuple& t2) const { - return rocprim::make_tuple( - rocprim::get<0>(t1) + rocprim::get<0>(t2), - rocprim::get<1>(t1) + rocprim::get<1>(t2) - ); + return rocprim::make_tuple(rocprim::get<0>(t1) + rocprim::get<0>(t2), + rocprim::get<1>(t1) + rocprim::get<1>(t2)); }; }; TEST(RocprimZipIteratorTests, TransformReduce) { - using T1 = int; - using T2 = unsigned int; - using T3 = unsigned char; - using U1 = T1; - using U2 = T2; - const bool debug_synchronous = false; - const size_t size = 1024 * 16; + using T1 = int; + using T2 = unsigned int; + using T3 = unsigned char; + using U1 = T1; + using U2 = T2; + const bool debug_synchronous = false; + const size_t size = 1024 * 16; // using default stream hipStream_t stream = 0; @@ -309,112 +256,73 @@ TEST(RocprimZipIteratorTests, TransformReduce) // Copy input data to device HIP_CHECK( - hipMemcpy( - d_input1, input1.data(), - input1.size() * sizeof(T1), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input1, input1.data(), input1.size() * sizeof(T1), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input2, input2.data(), - input2.size() * sizeof(T2), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input2, input2.data(), input2.size() * sizeof(T2), hipMemcpyHostToDevice)); HIP_CHECK( - hipMemcpy( - d_input3, input3.data(), - input3.size() * sizeof(T3), - hipMemcpyHostToDevice - ) - ); + hipMemcpy(d_input3, input3.data(), input3.size() * sizeof(T3), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Calculate expected results on host U1 expected1 = std::accumulate(input1.begin(), input1.end(), T1(0)); U2 expected2 = std::accumulate(input2.begin(), input2.end(), T2(0)) - + std::accumulate(input3.begin(), input3.end(), T2(0)); + + std::accumulate(input3.begin(), input3.end(), T2(0)); // temp storage size_t temp_storage_size_bytes; // Get size of d_temp_storage - HIP_CHECK( - rocprim::reduce( - nullptr, - temp_storage_size_bytes, - rocprim::make_transform_iterator( - rocprim::make_zip_iterator( - rocprim::make_tuple(d_input1, d_input2, d_input3) - ), - tuple3to2_transform_op() - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(d_output1, d_output2) - ), - input1.size(), - tuple2_reduce_op(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce( + nullptr, + temp_storage_size_bytes, + rocprim::make_transform_iterator( + rocprim::make_zip_iterator(rocprim::make_tuple(d_input1, d_input2, d_input3)), + tuple3to2_transform_op()), + rocprim::make_zip_iterator(rocprim::make_tuple(d_output1, d_output2)), + input1.size(), + tuple2_reduce_op(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // temp_storage_size_bytes must be >0 ASSERT_GT(temp_storage_size_bytes, 0); // allocate temporary storage - void * d_temp_storage = nullptr; + void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); ASSERT_NE(d_temp_storage, nullptr); // Run - HIP_CHECK( - rocprim::reduce( - d_temp_storage, - temp_storage_size_bytes, - rocprim::make_transform_iterator( - rocprim::make_zip_iterator( - rocprim::make_tuple(d_input1, d_input2, d_input3) - ), - tuple3to2_transform_op() - ), - rocprim::make_zip_iterator( - rocprim::make_tuple(d_output1, d_output2) - ), - input1.size(), - tuple2_reduce_op(), - stream, - debug_synchronous - ) - ); + HIP_CHECK(rocprim::reduce( + d_temp_storage, + temp_storage_size_bytes, + rocprim::make_transform_iterator( + rocprim::make_zip_iterator(rocprim::make_tuple(d_input1, d_input2, d_input3)), + tuple3to2_transform_op()), + rocprim::make_zip_iterator(rocprim::make_tuple(d_output1, d_output2)), + input1.size(), + tuple2_reduce_op(), + stream, + debug_synchronous)); HIP_CHECK(hipDeviceSynchronize()); // Copy output to host HIP_CHECK( - hipMemcpy( - output1.data(), d_output1, - output1.size() * sizeof(U1), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output1.data(), d_output1, output1.size() * sizeof(U1), hipMemcpyDeviceToHost)); HIP_CHECK( - hipMemcpy( - output2.data(), d_output2, - output2.size() * sizeof(U2), - hipMemcpyDeviceToHost - ) - ); + hipMemcpy(output2.data(), d_output2, output2.size() * sizeof(U2), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); // Check if output values are as expected auto diff1 = std::max(std::abs(0.01f * expected1), U1(0.01f)); - if(std::is_integral::value) diff1 = 0; + if(std::is_integral::value) + diff1 = 0; ASSERT_NEAR(output1[0], expected1, diff1); auto diff2 = std::max(std::abs(0.01f * expected2), U2(0.01f)); - if(std::is_integral::value) diff2 = 0; + if(std::is_integral::value) + diff2 = 0; ASSERT_NEAR(output2[0], expected2, diff2); hipFree(d_input1); diff --git a/test/test_hip_api.cpp b/test/test_hip_api.cpp index 45fb90c34..5c5d859cd 100644 --- a/test/test_hip_api.cpp +++ b/test/test_hip_api.cpp @@ -30,15 +30,14 @@ #include #define HIP_CHECK(x) ASSERT_EQ(x, hipSuccess) -template +template T ax(const T a, const T x) __device__ { return x * a; } template -__global__ -void saxpy_kernel(const T * x, T * y, const T a, const size_t size) +__global__ void saxpy_kernel(const T* x, T* y, const T a, const size_t size) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(i < size) @@ -51,44 +50,30 @@ TEST(HIPTests, Saxpy) { const size_t N = 100; - const float a = 100.0f; + const float a = 100.0f; std::vector x(N, 2.0f); std::vector y(N, 1.0f); - float * d_x; - float * d_y; + float* d_x; + float* d_y; HIP_CHECK(hipMalloc(&d_x, N * sizeof(float))); HIP_CHECK(hipMalloc(&d_y, N * sizeof(float))); - HIP_CHECK( - hipMemcpy( - d_x, x.data(), - N * sizeof(float), - hipMemcpyHostToDevice - ) - ); - HIP_CHECK( - hipMemcpy( - d_y, y.data(), - N * sizeof(float), - hipMemcpyHostToDevice - ) - ); + HIP_CHECK(hipMemcpy(d_x, x.data(), N * sizeof(float), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_y, y.data(), N * sizeof(float), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(saxpy_kernel), - dim3((N + 255)/256), dim3(256), 0, 0, - d_x, d_y, a, N - ); + hipLaunchKernelGGL(HIP_KERNEL_NAME(saxpy_kernel), + dim3((N + 255) / 256), + dim3(256), + 0, + 0, + d_x, + d_y, + a, + N); HIP_CHECK(hipPeekAtLastError()); - HIP_CHECK( - hipMemcpy( - y.data(), d_y, - N * sizeof(float), - hipMemcpyDeviceToHost - ) - ); + HIP_CHECK(hipMemcpy(y.data(), d_y, N * sizeof(float), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipFree(d_x)); HIP_CHECK(hipFree(d_y));