Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
41ff492
Merge pull request #44 from ROCmSoftwarePlatform/develop
ex-rzr Feb 6, 2019
3584f6a
Fix key-value merge, add device functions with tests and benchmarks
ex-rzr Feb 7, 2019
bca53dc
Merge branch 'device_merge_by_key' into 'thrust'
Feb 11, 2019
0a17a39
Optimize device transform
ex-rzr Feb 12, 2019
90cbe81
Implement device-level binary search, lower and upper bound
ex-rzr Feb 12, 2019
635b037
Optimize binary search for (nearly-)power-of-two sizes of haystack
ex-rzr Feb 12, 2019
b84e95a
Force use of ROCm 2.2 beta and allow failure for S9300 and
neon60 Feb 22, 2019
a47d6e8
Merge branch 'device_binary_search' into 'thrust'
neon60 Feb 22, 2019
5936b87
Fix device reduce_by_key for non-unique keys, add tests for such cases
ex-rzr Mar 8, 2019
f11af28
Use BinaryFunction result type in match_result_type, remove OutputType
ex-rzr Mar 8, 2019
f08ba34
Support CUB's behavior for types of intermediate results in hipCUB
ex-rzr Mar 8, 2019
c274396
Merge branch 'fix_reduce_by_key_nonunique_keys' into 'thrust'
neon60 Mar 8, 2019
4face50
Merge branch 'fix_match_result_type' into 'thrust'
neon60 Mar 11, 2019
4319c25
Fix device-scan for scan operations that access memory
ex-rzr Mar 7, 2019
942aa5c
Fix device scan_by_key for non-unique keys, support in-place scan
ex-rzr Mar 12, 2019
d202522
Support in-place segmented scan_by_key
ex-rzr Mar 12, 2019
2a29b39
Merge branch 'fix_scan_by_key_nonunique_keys' into 'thrust'
neon60 Mar 12, 2019
c4e46f0
Remove temporary ROCm 2.2 repo for 243 server
VincentSC Mar 18, 2019
06fa4c2
Merge pull request #52 from ROCmSoftwarePlatform/develop
saadrahim Apr 15, 2019
ed15db4
Merge pull request #56 from ROCmSoftwarePlatform/develop
VincentSC Apr 17, 2019
ea5cf85
Merge branch 'thrust' into develop_stream
neon60 Apr 18, 2019
0e94482
Merge branch 'merge_thrust' into 'develop_stream'
ex-rzr Apr 25, 2019
94f161f
Merge branch 'master' into 'develop_stream'
ex-rzr Apr 25, 2019
b38041a
Support only integer types in counting_iterator
ex-rzr Apr 24, 2019
afd18f5
Do not use output type in partition to support Thrust discard_iterator
ex-rzr Apr 24, 2019
6274224
Do not use __global__ static member functions in benchmarks
ex-rzr Apr 25, 2019
c05cf01
Merge branch 'fix_partition_with_discard_iterator' into 'develop_stream'
neon60 Apr 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ add_rocprim_benchmark_hip(benchmark_hip_block_radix_sort.cpp)
add_rocprim_benchmark_hip(benchmark_hip_block_reduce.cpp)
add_rocprim_benchmark_hip(benchmark_hip_block_scan.cpp)
add_rocprim_benchmark_hip(benchmark_hip_block_sort.cpp)
add_rocprim_benchmark_hip(benchmark_hip_device_binary_search.cpp)
add_rocprim_benchmark_hip(benchmark_hip_device_histogram.cpp)
add_rocprim_benchmark_hip(benchmark_hip_device_merge.cpp)
add_rocprim_benchmark_hip(benchmark_hip_device_merge_sort.cpp)
Expand Down
8 changes: 4 additions & 4 deletions benchmark/benchmark_hc_block_discontinuity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ struct flag_heads
rp::block_store_direct_striped<BlockSize>(lid, d_output.data() + block_offset, input);
}
);
}
}
};

struct flag_tails
Expand All @@ -114,7 +114,7 @@ struct flag_tails
{
const unsigned int lid = idx.local[0];
const unsigned int block_offset = idx.tile[0] * ItemsPerThread * BlockSize;

T input[ItemsPerThread];
rp::block_load_direct_striped<BlockSize>(lid, d_input.data() + block_offset, input);

Expand Down Expand Up @@ -154,7 +154,7 @@ struct flag_heads_and_tails
bool WithTile,
unsigned int Trials
>
static void run(const hc::array<T> & d_input, const hc::array<T> & d_output,
static void run(const hc::array<T> & d_input, const hc::array<T> & d_output,
hc::accelerator_view acc_view, size_t size)
{
const size_t grid_size = size / ItemsPerThread;
Expand Down Expand Up @@ -304,7 +304,7 @@ int main(int argc, char *argv[])
benchmark::Initialize(&argc, argv);
const size_t size = parser.get<size_t>("size");
const int trials = parser.get<int>("trials");

// HC
hc::accelerator acc;
auto acc_view = acc.get_default_view();
Expand Down
28 changes: 21 additions & 7 deletions benchmark/benchmark_hip_block_discontinuity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;

namespace rp = rocprim;

template<
class Runner,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
bool WithTile,
unsigned int Trials
>
__global__
void kernel(const T * d_input, T * d_output)
{
Runner::template run<T, BlockSize, ItemsPerThread, WithTile, Trials>(d_input, d_output);
}

struct flag_heads
{
template<
Expand All @@ -65,8 +79,8 @@ struct flag_heads
bool WithTile,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand Down Expand Up @@ -108,8 +122,8 @@ struct flag_tails
bool WithTile,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand Down Expand Up @@ -151,8 +165,8 @@ struct flag_heads_and_tails
bool WithTile,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand Down Expand Up @@ -219,7 +233,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
auto start = std::chrono::high_resolution_clock::now();

hipLaunchKernelGGL(
HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, WithTile, Trials>),
HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, WithTile, Trials>),
dim3(size/items_per_block), dim3(BlockSize), 0, stream,
d_input, d_output
);
Expand Down
39 changes: 26 additions & 13 deletions benchmark/benchmark_hip_block_exchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;

namespace rp = rocprim;

template<
class Runner,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
void kernel(const T * d_input, T * d_output)
{
Runner::template run<T, BlockSize, ItemsPerThread, Trials>(d_input, d_output);
}

struct blocked_to_striped
{
template<
Expand All @@ -64,8 +77,8 @@ struct blocked_to_striped
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand All @@ -92,8 +105,8 @@ struct striped_to_blocked
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand All @@ -120,8 +133,8 @@ struct blocked_to_warp_striped
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand All @@ -148,8 +161,8 @@ struct warp_striped_to_blocked
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand All @@ -176,8 +189,8 @@ struct scatter_to_blocked
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand Down Expand Up @@ -206,8 +219,8 @@ struct scatter_to_striped
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T * d_input, T * d_output)
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
Expand Down Expand Up @@ -267,7 +280,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
auto start = std::chrono::high_resolution_clock::now();

hipLaunchKernelGGL(
HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, Trials>),
HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
dim3(size/items_per_block), dim3(BlockSize), 0, stream,
d_input, d_output
);
Expand Down
22 changes: 18 additions & 4 deletions benchmark/benchmark_hip_block_histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;

namespace rp = rocprim;

template<
class Runner,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int BinSize,
unsigned int Trials
>
__global__
void kernel(const T* input, T* output)
{
Runner::template run<T, BlockSize, ItemsPerThread, BinSize, Trials>(input, output);
}

template<rocprim::block_histogram_algorithm algorithm>
struct histogram
{
Expand All @@ -66,8 +80,8 @@ struct histogram
unsigned int BinSize,
unsigned int Trials
>
__global__
static void kernel(const T* input, T* output)
__device__
static void run(const T* input, T* output)
{
const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
unsigned int global_offset = hipBlockIdx_x * BinSize;
Expand Down Expand Up @@ -95,7 +109,7 @@ struct histogram
{
output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x];
global_offset += BlockSize;
}
}
}
}
};
Expand Down Expand Up @@ -133,7 +147,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
{
auto start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL(
HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, BinSize, Trials>),
HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, BinSize, Trials>),
dim3(size/items_per_block), dim3(BlockSize), 0, stream,
d_input, d_output
);
Expand Down
19 changes: 16 additions & 3 deletions benchmark/benchmark_hip_block_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;

namespace rp = rocprim;

template<
class Runner,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
void kernel(const T* input, T* output)
{
Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
}

template<rocprim::block_reduce_algorithm algorithm>
struct reduce
{
Expand All @@ -65,8 +78,8 @@ struct reduce
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T* input, T* output)
__device__
static void run(const T* input, T* output)
{
const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

Expand Down Expand Up @@ -125,7 +138,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
{
auto start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL(
HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, Trials>),
HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
dim3(size/items_per_block), dim3(BlockSize), 0, stream,
d_input, d_output
);
Expand Down
23 changes: 18 additions & 5 deletions benchmark/benchmark_hip_block_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;

namespace rp = rocprim;

template<
class Runner,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
void kernel(const T* input, T* output)
{
Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
}

template<rocprim::block_scan_algorithm algorithm>
struct inclusive_scan
{
Expand All @@ -65,8 +78,8 @@ struct inclusive_scan
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T* input, T* output)
__device__
static void run(const T* input, T* output)
{
const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

Expand Down Expand Up @@ -102,8 +115,8 @@ struct exclusive_scan
unsigned int ItemsPerThread,
unsigned int Trials
>
__global__
static void kernel(const T* input, T* output)
__device__
static void run(const T* input, T* output)
{
const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
using U = typename std::remove_reference<T>::type;
Expand Down Expand Up @@ -164,7 +177,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
{
auto start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL(
HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, Trials>),
HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
dim3(size/items_per_block), dim3(BlockSize), 0, stream,
d_input, d_output
);
Expand Down
Loading