ROCm · ex-rzr · Apr 30, 2019 · Feb 6, 2019 · Feb 7, 2019 · Feb 11, 2019
@@ -60,6 +60,7 @@ add_rocprim_benchmark_hip(benchmark_hip_block_radix_sort.cpp)
 add_rocprim_benchmark_hip(benchmark_hip_block_reduce.cpp)
 add_rocprim_benchmark_hip(benchmark_hip_block_scan.cpp)
 add_rocprim_benchmark_hip(benchmark_hip_block_sort.cpp)
+add_rocprim_benchmark_hip(benchmark_hip_device_binary_search.cpp)
 add_rocprim_benchmark_hip(benchmark_hip_device_histogram.cpp)
 add_rocprim_benchmark_hip(benchmark_hip_device_merge.cpp)
 add_rocprim_benchmark_hip(benchmark_hip_device_merge_sort.cpp)

@@ -91,7 +91,7 @@ struct flag_heads
                 rp::block_store_direct_striped<BlockSize>(lid, d_output.data() + block_offset, input);
             }
         );
-    } 
+    }
 };
 
 struct flag_tails
@@ -114,7 +114,7 @@ struct flag_tails
             {
                 const unsigned int lid = idx.local[0];
                 const unsigned int block_offset = idx.tile[0] * ItemsPerThread * BlockSize;
-                
+
                 T input[ItemsPerThread];
                 rp::block_load_direct_striped<BlockSize>(lid, d_input.data() + block_offset, input);
 
@@ -154,7 +154,7 @@ struct flag_heads_and_tails
         bool WithTile,
         unsigned int Trials
     >
-static void run(const hc::array<T> & d_input, const hc::array<T> & d_output,
+    static void run(const hc::array<T> & d_input, const hc::array<T> & d_output,
                     hc::accelerator_view acc_view, size_t size)
     {
         const size_t grid_size = size / ItemsPerThread;
@@ -304,7 +304,7 @@ int main(int argc, char *argv[])
     benchmark::Initialize(&argc, argv);
     const size_t size = parser.get<size_t>("size");
     const int trials = parser.get<int>("trials");
-    
+
     // HC
     hc::accelerator acc;
     auto acc_view = acc.get_default_view();

@@ -56,6 +56,20 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;
 
 namespace rp = rocprim;
 
+// Free-function kernel entry point shared by the benchmark structs below.
+// Runner must provide a static member template `run` that is callable from
+// device code; all template arguments after Runner, and both pointers, are
+// handed to it unchanged.
+template<
+    class Runner, class T,
+    unsigned int BlockSize, unsigned int ItemsPerThread,
+    bool WithTile, unsigned int Trials
+>
+__global__ void kernel(const T * d_input, T * d_output)
+{
+    Runner::template run<T, BlockSize, ItemsPerThread, WithTile, Trials>(d_input, d_output);
+}
+
 struct flag_heads
 {
     template<
@@ -65,8 +79,8 @@ struct flag_heads
         bool WithTile,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -108,8 +122,8 @@ struct flag_tails
         bool WithTile,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -151,8 +165,8 @@ struct flag_heads_and_tails
         bool WithTile,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -219,7 +233,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         auto start = std::chrono::high_resolution_clock::now();
 
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, WithTile, Trials>),
+            HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, WithTile, Trials>),
             dim3(size/items_per_block), dim3(BlockSize), 0, stream,
             d_input, d_output
         );

@@ -56,6 +56,19 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;
 
 namespace rp = rocprim;
 
+// Free-function kernel entry point shared by the benchmark structs below.
+// Runner must provide a static member template `run` callable from device
+// code; the remaining template arguments and both pointers are forwarded.
+template<
+    class Runner, class T,
+    unsigned int BlockSize, unsigned int ItemsPerThread,
+    unsigned int Trials
+>
+__global__ void kernel(const T * d_input, T * d_output)
+{
+    Runner::template run<T, BlockSize, ItemsPerThread, Trials>(d_input, d_output);
+}
+
 struct blocked_to_striped
 {
     template<
@@ -64,8 +77,8 @@ struct blocked_to_striped
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -92,8 +105,8 @@ struct striped_to_blocked
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -120,8 +133,8 @@ struct blocked_to_warp_striped
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -148,8 +161,8 @@ struct warp_striped_to_blocked
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -176,8 +189,8 @@ struct scatter_to_blocked
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -206,8 +219,8 @@ struct scatter_to_striped
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T * d_input, T * d_output)
+    __device__
+    static void run(const T * d_input, T * d_output)
     {
         const unsigned int lid = hipThreadIdx_x;
         const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
@@ -267,7 +280,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
         auto start = std::chrono::high_resolution_clock::now();
 
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, Trials>),
+            HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
             dim3(size/items_per_block), dim3(BlockSize), 0, stream,
             d_input, d_output
         );

@@ -56,6 +56,20 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;
 
 namespace rp = rocprim;
 
+// Free-function kernel entry point for the benchmark struct below.
+// Runner must provide a static member template `run` that is callable from
+// device code; all template arguments after Runner, and both pointers, are
+// handed to it unchanged.
+template<
+    class Runner, class T,
+    unsigned int BlockSize, unsigned int ItemsPerThread,
+    unsigned int BinSize, unsigned int Trials
+>
+__global__ void kernel(const T* input, T* output)
+{
+    Runner::template run<T, BlockSize, ItemsPerThread, BinSize, Trials>(input, output);
+}
+
 template<rocprim::block_histogram_algorithm algorithm>
 struct histogram
 {
@@ -66,8 +80,8 @@ struct histogram
         unsigned int BinSize,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T* input, T* output)
+    __device__
+    static void run(const T* input, T* output)
     {
         const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread;
         unsigned int global_offset = hipBlockIdx_x * BinSize;
@@ -95,7 +109,7 @@ struct histogram
             {
                 output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x];
                 global_offset += BlockSize;
-            }    
+            }
         }
     }
 };
@@ -133,7 +147,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     {
         auto start = std::chrono::high_resolution_clock::now();
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, BinSize, Trials>),
+            HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, BinSize, Trials>),
             dim3(size/items_per_block), dim3(BlockSize), 0, stream,
             d_input, d_output
         );

@@ -56,6 +56,19 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;
 
 namespace rp = rocprim;
 
+// Free-function kernel entry point for the benchmark struct below.
+// Runner must provide a static member template `run` callable from device
+// code; the remaining template arguments and both pointers are forwarded.
+template<
+    class Runner, class T,
+    unsigned int BlockSize, unsigned int ItemsPerThread,
+    unsigned int Trials
+>
+__global__ void kernel(const T* input, T* output)
+{
+    Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
+}
+
 template<rocprim::block_reduce_algorithm algorithm>
 struct reduce
 {
@@ -65,8 +78,8 @@ struct reduce
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T* input, T* output)
+    __device__
+    static void run(const T* input, T* output)
     {
         const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
@@ -125,7 +138,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     {
         auto start = std::chrono::high_resolution_clock::now();
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, Trials>),
+            HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
             dim3(size/items_per_block), dim3(BlockSize), 0, stream,
             d_input, d_output
         );

@@ -56,6 +56,19 @@ const size_t DEFAULT_N = 1024 * 1024 * 128;
 
 namespace rp = rocprim;
 
+// Free-function kernel entry point shared by the benchmark structs below.
+// Runner must provide a static member template `run` callable from device
+// code; the remaining template arguments and both pointers are forwarded.
+template<
+    class Runner, class T,
+    unsigned int BlockSize, unsigned int ItemsPerThread,
+    unsigned int Trials
+>
+__global__ void kernel(const T* input, T* output)
+{
+    Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
+}
+
 template<rocprim::block_scan_algorithm algorithm>
 struct inclusive_scan
 {
@@ -65,8 +78,8 @@ struct inclusive_scan
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T* input, T* output)
+    __device__
+    static void run(const T* input, T* output)
     {
         const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
@@ -102,8 +115,8 @@ struct exclusive_scan
         unsigned int ItemsPerThread,
         unsigned int Trials
     >
-    __global__
-    static void kernel(const T* input, T* output)
+    __device__
+    static void run(const T* input, T* output)
     {
         const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
         using U = typename std::remove_reference<T>::type;
@@ -164,7 +177,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
     {
         auto start = std::chrono::high_resolution_clock::now();
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(Benchmark::template kernel<T, BlockSize, ItemsPerThread, Trials>),
+            HIP_KERNEL_NAME(kernel<Benchmark, T, BlockSize, ItemsPerThread, Trials>),
             dim3(size/items_per_block), dim3(BlockSize), 0, stream,
             d_input, d_output
         );