Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e9b0748
add gfx906 as target
pramenku Sep 22, 2018
86133a7
Merge pull request #32 from pramenku/patch-1
VincentSC Sep 24, 2018
02d6006
Update name and email for the package maintainer
VincentSC Oct 18, 2018
d4415cf
Use HIP API shfl functions rather than hc::
aaronenyeshi Aug 24, 2018
5508ed2
Use HIP shfl functions for HIP-Clang
aaronenyeshi Aug 29, 2018
8b0f644
Bump rocPRIM version to 1.0.2.0
Aug 30, 2018
b8298b5
Ignore warnings about deprecated CUDA function(s) in HIP
Aug 30, 2018
c564b83
Improve device merge sort tests with custom types
ex-rzr Sep 24, 2018
5ced9c6
Update .gitlab-ci.yml
VincentSC Sep 26, 2018
38c7f26
Update .gitlab-ci.yml
VincentSC Sep 26, 2018
4610e0f
Update .gitlab-ci.yml
VincentSC Sep 26, 2018
b7773ea
Update .gitignore
VincentSC Sep 26, 2018
919b1bc
Add new file
VincentSC Sep 26, 2018
3f3afe1
Update .gitlab-ci-gputest.yml
VincentSC Sep 26, 2018
042eacc
Update .gitlab-ci.yml
VincentSC Sep 26, 2018
8368beb
Make sure the building is done on a server that has rocm installed
VincentSC Sep 26, 2018
ff41d04
Update .gitlab-ci.yml
VincentSC Sep 26, 2018
c11dda7
Fix missing initial value in BlockScan::ExclusiveSum
ex-rzr Sep 24, 2018
c41212f
Add gfx906 to test/extra (tests for install and package install)
ex-rzr Oct 2, 2018
d82ec52
Print AMDGPU_TARGETS in CMake Summary
ex-rzr Oct 2, 2018
078cb46
Add tests with half and char for block-level primitives
ex-rzr Oct 2, 2018
ac7d154
Disable loopback scan for half, add compile-time specialization choosing
ex-rzr Oct 3, 2018
f386dee
Add tests with half and char for device-level primitives
ex-rzr Oct 3, 2018
4344d4c
Replace reinterpret_cast with __builtin_memcpy
ex-rzr Oct 3, 2018
6af6cb6
Update tests for block_reduce
Oct 2, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

### Build dirs ###
build/

Expand Down Expand Up @@ -56,4 +55,7 @@ compile_commands.json
CTestTestfile.cmake
build

### Gtilab CI ###
.gitlab-ci-gputest.yml

# End of https://www.gitignore.io/api/c++,cmake
14 changes: 14 additions & 0 deletions .gitlab-ci-gputest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
test:rocm241:
extends: .unittest
tags:
- tag241

test:rocm243:
extends: .unittest
tags:
- tag243

test:rocm244:
extends: .unittest
tags:
- tag244
22 changes: 8 additions & 14 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ before_script:

build:rocm:
stage: build
tags:
- rocm
variables:
SUDO_CMD: "sudo -E"
script:
Expand All @@ -71,9 +73,7 @@ build:rocm:
- build/rocprim*.zip
expire_in: 2 weeks

test:rocm243:
tags:
- tag243
.unittest:
stage: test
variables:
SUDO_CMD: "sudo -E"
Expand All @@ -83,20 +83,12 @@ test:rocm243:
- cd build
- $SUDO_CMD ctest --output-on-failure --repeat-until-fail 2

test:rocm244:
tags:
- tag244
stage: test
variables:
SUDO_CMD: "sudo -E"
dependencies:
- build:rocm
script:
- cd build
- $SUDO_CMD ctest --output-on-failure --repeat-until-fail 2
include: '.gitlab-ci-gputest.yml'

test:rocm_package:
stage: test
tags:
- rocm
variables:
SUDO_CMD: "sudo -E"
dependencies:
Expand All @@ -112,6 +104,8 @@ test:rocm_package:

test:rocm_install:
stage: test
tags:
- rocm
variables:
SUDO_CMD: "sudo -E"
script:
Expand Down
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ cmake_minimum_required(VERSION 3.5.1 FATAL_ERROR)
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories")

# rocPRIM project
project(rocprim VERSION 1.0.1.0 LANGUAGES CXX)
project(rocprim VERSION 1.0.2.0 LANGUAGES CXX)

# CMake modules
list(APPEND CMAKE_MODULE_PATH
Expand Down Expand Up @@ -62,13 +62,13 @@ option(ONLY_INSTALL "Only install" OFF)
# Get dependencies
include(cmake/Dependencies.cmake)

# AMD targets
set(AMDGPU_TARGETS gfx803;gfx900;gfx906 CACHE STRING "List of specific machine types for library to target")

# Print configuration summary
include(cmake/Summary.cmake)
print_configuration_summary()

# AMD targets
set(AMDGPU_TARGETS gfx803;gfx900 CACHE STRING "List of specific machine types for library to target")

# rocPRIM works only on hcc
if(HIP_PLATFORM STREQUAL "hcc")
# rocPRIM library
Expand Down Expand Up @@ -118,12 +118,12 @@ if(HIP_PLATFORM STREQUAL "hcc")
rocm_create_package(
NAME rocprim
DESCRIPTION "Radeon Open Compute Parallel Primitives Libary"
MAINTAINER "Jakub Szuppe <jakub@streamhpc.com>"
MAINTAINER "Stream HPC Maintainers <maintainer@streamhpc.com>"
)
else()
rocm_create_package(
NAME rocprim-hipcub
DESCRIPTION "Radeon Open Compute Parallel Primitives Libary (hipCUB only)"
MAINTAINER "Jakub Szuppe <jakub@streamhpc.com>"
MAINTAINER "Stream HPC Maintainers <maintainer@streamhpc.com>"
)
endif()
3 changes: 2 additions & 1 deletion cmake/SetupNVCC.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,5 @@ string(REPLACE " " ";" HIP_CPP_CONFIG_FLAGS ${HIP_CPP_CONFIG_FLAGS})
list(APPEND CUDA_NVCC_FLAGS "-std=c++11 ${HIP_CPP_CONFIG_FLAGS} ${HIP_NVCC_FLAGS}")

# Ignore warnings about #pragma unroll
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
# and about deprecated CUDA function(s) used in hip/nvcc_detail/hip_runtime_api.h
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas -Wno-deprecated-declarations")
1 change: 1 addition & 0 deletions cmake/Summary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ function(print_configuration_summary)
message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}")
message(STATUS " Build type : ${CMAKE_BUILD_TYPE}")
message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}")
message(STATUS " Device targets : ${AMDGPU_TARGETS}")
message(STATUS "")
message(STATUS " BUILD_TEST : ${BUILD_TEST}")
message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}")
Expand Down
8 changes: 4 additions & 4 deletions hipcub/include/hipcub/rocprim/block/block_scan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,13 @@ class BlockScan
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output)
{
base_type::exclusive_scan(input, output, temp_storage_);
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}

HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::exclusive_scan(input, output, block_aggregate, temp_storage_);
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}

template<typename BlockPrefixCallbackOp>
Expand All @@ -212,15 +212,15 @@ class BlockScan
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::exclusive_scan(input, output, temp_storage_);
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}

template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::exclusive_scan(input, output, block_aggregate, temp_storage_);
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}

template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
Expand Down
10 changes: 5 additions & 5 deletions rocprim/include/rocprim/device/detail/device_merge_sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,11 +359,11 @@ void block_sort_kernel_impl(KeysInputIterator keys_input,
);

block_sort_impl<with_values, BlockSize>(
key[0],
value[0],
valid_in_last_block,
last_block,
compare_function
key[0],
value[0],
valid_in_last_block,
last_block,
compare_function
);

block_store_impl<with_values, BlockSize>(
Expand Down
4 changes: 2 additions & 2 deletions rocprim/include/rocprim/device/device_merge_sort_hc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ void merge_sort_impl(void * temporary_storage,
ROCPRIM_DETAIL_HC_SYNC("block_sort_kernel", size, start)

bool temporary_store = false;
for(unsigned int block = block_size ; block < size; block *= 2)
for(unsigned int block = block_size; block < size; block *= 2)
{
temporary_store = !temporary_store;
if(temporary_store)
Expand All @@ -149,7 +149,7 @@ void merge_sort_impl(void * temporary_storage,
);
}
);
ROCPRIM_DETAIL_HC_SYNC("block_merge_buffer_kernel", size, start)
ROCPRIM_DETAIL_HC_SYNC("block_merge_kernel", size, start)
}
else
{
Expand Down
10 changes: 5 additions & 5 deletions rocprim/include/rocprim/device/device_merge_sort_hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,33 +176,33 @@ hipError_t merge_sort_impl(void * temporary_storage,

const unsigned int grid_size = number_of_blocks;
hipLaunchKernelGGL(
HIP_KERNEL_NAME(detail::block_sort_kernel<block_size>),
HIP_KERNEL_NAME(block_sort_kernel<block_size>),
dim3(grid_size), dim3(block_size), 0, stream,
keys_input, keys_output, values_input, values_output,
size, compare_function
);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start);

bool temporary_store = false;
for(unsigned int block = block_size ; block < size; block *= 2)
for(unsigned int block = block_size; block < size; block *= 2)
{
temporary_store = !temporary_store;
if(temporary_store)
{
if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL(
HIP_KERNEL_NAME(detail::block_merge_kernel),
HIP_KERNEL_NAME(block_merge_kernel),
dim3(grid_size), dim3(block_size), 0, stream,
keys_output, keys_buffer, values_output, values_buffer,
size, block, compare_function
);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_buffer_kernel", size, start);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start);
}
else
{
if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL(
HIP_KERNEL_NAME(detail::block_merge_kernel),
HIP_KERNEL_NAME(block_merge_kernel),
dim3(grid_size), dim3(block_size), 0, stream,
keys_buffer, keys_output, values_buffer, values_output,
size, block, compare_function
Expand Down
75 changes: 30 additions & 45 deletions rocprim/include/rocprim/device/device_scan_hc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ namespace detail

template<
bool Exclusive,
bool UseLoopback,
class Config,
class InputIterator,
class OutputIterator,
class InitValueType,
class BinaryFunction
>
inline
void scan_impl(void * temporary_storage,
auto scan_impl(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
Expand All @@ -72,6 +73,7 @@ void scan_impl(void * temporary_storage,
BinaryFunction scan_op,
hc::accelerator_view acc_view,
const bool debug_synchronous)
-> typename std::enable_if<!UseLoopback>::type
{
using input_type = typename std::iterator_traits<InputIterator>::value_type;
using output_type = typename std::iterator_traits<OutputIterator>::value_type;
Expand Down Expand Up @@ -140,7 +142,7 @@ void scan_impl(void * temporary_storage,
auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type));

if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
scan_impl<false, config>(
scan_impl<false, false, config>(
nested_temp_storage,
nested_temp_storage_size,
block_prefixes, // input
Expand Down Expand Up @@ -188,22 +190,24 @@ void scan_impl(void * temporary_storage,

template<
bool Exclusive,
bool UseLoopback,
class Config,
class InputIterator,
class OutputIterator,
class InitValueType,
class BinaryFunction
>
inline
void lookback_scan_impl(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
const InitValueType initial_value,
const size_t size,
BinaryFunction scan_op,
hc::accelerator_view acc_view,
const bool debug_synchronous)
auto scan_impl(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
const InitValueType initial_value,
const size_t size,
BinaryFunction scan_op,
hc::accelerator_view acc_view,
const bool debug_synchronous)
-> typename std::enable_if<UseLoopback>::type
{
using input_type = typename std::iterator_traits<InputIterator>::value_type;
using output_type = typename std::iterator_traits<OutputIterator>::value_type;
Expand Down Expand Up @@ -404,24 +408,14 @@ void inclusive_scan(void * temporary_storage,
>::type;

// Lookback scan has problems with types that are not arithmetic
if(::rocprim::is_arithmetic<result_type>::value)
{
return detail::lookback_scan_impl<false, Config>(
temporary_storage, storage_size,
// result_type() is a dummy initial value (not used)
input, output, result_type(), size,
scan_op, acc_view, debug_synchronous
);
}
else
{
return detail::scan_impl<false, Config>(
temporary_storage, storage_size,
// result_type() is a dummy initial value (not used)
input, output, result_type(), size,
scan_op, acc_view, debug_synchronous
);
}
// TODO: Investigate why the compiler never finishes linking if half is used
// Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic
detail::scan_impl<false, std::is_arithmetic<result_type>::value, Config>(
temporary_storage, storage_size,
// result_type() is a dummy initial value (not used)
input, output, result_type(), size,
scan_op, acc_view, debug_synchronous
);
}

/// \brief HC parallel exclusive scan primitive for device level.
Expand Down Expand Up @@ -533,22 +527,13 @@ void exclusive_scan(void * temporary_storage,
>::type;

// Lookback scan has problems with types that are not arithmetic
if(::rocprim::is_arithmetic<result_type>::value)
{
return detail::lookback_scan_impl<true, Config>(
temporary_storage, storage_size,
input, output, initial_value, size,
scan_op, acc_view, debug_synchronous
);
}
else
{
return detail::scan_impl<true, Config>(
temporary_storage, storage_size,
input, output, initial_value, size,
scan_op, acc_view, debug_synchronous
);
}
// TODO: Investigate why the compiler never finishes linking if half is used
// Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic
detail::scan_impl<true, std::is_arithmetic<result_type>::value, Config>(
temporary_storage, storage_size,
input, output, initial_value, size,
scan_op, acc_view, debug_synchronous
);
}

/// @}
Expand Down
Loading