Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

### Build dirs ###
build/

Expand Down Expand Up @@ -56,4 +55,7 @@ compile_commands.json
CTestTestfile.cmake
build

### Gtilab CI ###
.gitlab-ci-gputest.yml

# End of https://www.gitignore.io/api/c++,cmake
14 changes: 14 additions & 0 deletions .gitlab-ci-gputest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
test:rocm241:
extends: .unittest
tags:
- tag241

test:rocm243:
extends: .unittest
tags:
- tag243

test:rocm244:
extends: .unittest
tags:
- tag244
22 changes: 8 additions & 14 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ before_script:

build:rocm:
stage: build
tags:
- rocm
variables:
SUDO_CMD: "sudo -E"
script:
Expand All @@ -71,9 +73,7 @@ build:rocm:
- build/rocprim*.zip
expire_in: 2 weeks

test:rocm243:
tags:
- tag243
.unittest:
stage: test
variables:
SUDO_CMD: "sudo -E"
Expand All @@ -83,20 +83,12 @@ test:rocm243:
- cd build
- $SUDO_CMD ctest --output-on-failure --repeat-until-fail 2

test:rocm244:
tags:
- tag244
stage: test
variables:
SUDO_CMD: "sudo -E"
dependencies:
- build:rocm
script:
- cd build
- $SUDO_CMD ctest --output-on-failure --repeat-until-fail 2
include: '.gitlab-ci-gputest.yml'

test:rocm_package:
stage: test
tags:
- rocm
variables:
SUDO_CMD: "sudo -E"
dependencies:
Expand All @@ -112,6 +104,8 @@ test:rocm_package:

test:rocm_install:
stage: test
tags:
- rocm
variables:
SUDO_CMD: "sudo -E"
script:
Expand Down
17 changes: 12 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ cmake_minimum_required(VERSION 3.5.1 FATAL_ERROR)
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories")

# rocPRIM project
project(rocprim VERSION 1.0.1.0 LANGUAGES CXX)
project(rocprim VERSION 1.0.2.0 LANGUAGES CXX)

# CMake modules
list(APPEND CMAKE_MODULE_PATH
Expand All @@ -46,11 +46,18 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Add paths to linker se
# Verify that hcc compiler is used on ROCM platform
include(cmake/VerifyCompiler.cmake)

# Build option to disable -Werror
option(DISABLE_WERROR "Disable building with Werror" OFF)

# Set CXX flags
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
if(DISABLE_WERROR)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
endif()

# Build options
option(BUILD_TEST "Build tests (requires googletest)" ON)
Expand All @@ -62,13 +69,13 @@ option(ONLY_INSTALL "Only install" OFF)
# Get dependencies
include(cmake/Dependencies.cmake)

# AMD targets
set(AMDGPU_TARGETS gfx803;gfx900;gfx906 CACHE STRING "List of specific machine types for library to target")

# Print configuration summary
include(cmake/Summary.cmake)
print_configuration_summary()

# AMD targets
set(AMDGPU_TARGETS gfx803;gfx900;gfx906 CACHE STRING "List of specific machine types for library to target")

# rocPRIM works only on hcc
if(HIP_PLATFORM STREQUAL "hcc")
# rocPRIM library
Expand Down
3 changes: 2 additions & 1 deletion cmake/SetupNVCC.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,5 @@ string(REPLACE " " ";" HIP_CPP_CONFIG_FLAGS ${HIP_CPP_CONFIG_FLAGS})
list(APPEND CUDA_NVCC_FLAGS "-std=c++11 ${HIP_CPP_CONFIG_FLAGS} ${HIP_NVCC_FLAGS}")

# Ignore warnings about #pragma unroll
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
# and about deprecated CUDA function(s) used in hip/nvcc_detail/hip_runtime_api.h
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas -Wno-deprecated-declarations")
1 change: 1 addition & 0 deletions cmake/Summary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ function(print_configuration_summary)
message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}")
message(STATUS " Build type : ${CMAKE_BUILD_TYPE}")
message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}")
message(STATUS " Device targets : ${AMDGPU_TARGETS}")
message(STATUS "")
message(STATUS " BUILD_TEST : ${BUILD_TEST}")
message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}")
Expand Down
8 changes: 4 additions & 4 deletions hipcub/include/hipcub/rocprim/block/block_scan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,13 @@ class BlockScan
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output)
{
base_type::exclusive_scan(input, output, temp_storage_);
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}

HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::exclusive_scan(input, output, block_aggregate, temp_storage_);
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}

template<typename BlockPrefixCallbackOp>
Expand All @@ -212,15 +212,15 @@ class BlockScan
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::exclusive_scan(input, output, temp_storage_);
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}

template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::exclusive_scan(input, output, block_aggregate, temp_storage_);
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}

template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
Expand Down
10 changes: 5 additions & 5 deletions rocprim/include/rocprim/device/detail/device_merge_sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,11 +359,11 @@ void block_sort_kernel_impl(KeysInputIterator keys_input,
);

block_sort_impl<with_values, BlockSize>(
key[0],
value[0],
valid_in_last_block,
last_block,
compare_function
key[0],
value[0],
valid_in_last_block,
last_block,
compare_function
);

block_store_impl<with_values, BlockSize>(
Expand Down
4 changes: 2 additions & 2 deletions rocprim/include/rocprim/device/device_merge_sort_hc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ void merge_sort_impl(void * temporary_storage,
ROCPRIM_DETAIL_HC_SYNC("block_sort_kernel", size, start)

bool temporary_store = false;
for(unsigned int block = block_size ; block < size; block *= 2)
for(unsigned int block = block_size; block < size; block *= 2)
{
temporary_store = !temporary_store;
if(temporary_store)
Expand All @@ -149,7 +149,7 @@ void merge_sort_impl(void * temporary_storage,
);
}
);
ROCPRIM_DETAIL_HC_SYNC("block_merge_buffer_kernel", size, start)
ROCPRIM_DETAIL_HC_SYNC("block_merge_kernel", size, start)
}
else
{
Expand Down
10 changes: 5 additions & 5 deletions rocprim/include/rocprim/device/device_merge_sort_hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,33 +176,33 @@ hipError_t merge_sort_impl(void * temporary_storage,

const unsigned int grid_size = number_of_blocks;
hipLaunchKernelGGL(
HIP_KERNEL_NAME(detail::block_sort_kernel<block_size>),
HIP_KERNEL_NAME(block_sort_kernel<block_size>),
dim3(grid_size), dim3(block_size), 0, stream,
keys_input, keys_output, values_input, values_output,
size, compare_function
);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start);

bool temporary_store = false;
for(unsigned int block = block_size ; block < size; block *= 2)
for(unsigned int block = block_size; block < size; block *= 2)
{
temporary_store = !temporary_store;
if(temporary_store)
{
if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL(
HIP_KERNEL_NAME(detail::block_merge_kernel),
HIP_KERNEL_NAME(block_merge_kernel),
dim3(grid_size), dim3(block_size), 0, stream,
keys_output, keys_buffer, values_output, values_buffer,
size, block, compare_function
);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_buffer_kernel", size, start);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start);
}
else
{
if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL(
HIP_KERNEL_NAME(detail::block_merge_kernel),
HIP_KERNEL_NAME(block_merge_kernel),
dim3(grid_size), dim3(block_size), 0, stream,
keys_buffer, keys_output, values_buffer, values_output,
size, block, compare_function
Expand Down
75 changes: 30 additions & 45 deletions rocprim/include/rocprim/device/device_scan_hc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ namespace detail

template<
bool Exclusive,
bool UseLoopback,
class Config,
class InputIterator,
class OutputIterator,
class InitValueType,
class BinaryFunction
>
inline
void scan_impl(void * temporary_storage,
auto scan_impl(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
Expand All @@ -72,6 +73,7 @@ void scan_impl(void * temporary_storage,
BinaryFunction scan_op,
hc::accelerator_view acc_view,
const bool debug_synchronous)
-> typename std::enable_if<!UseLoopback>::type
{
using input_type = typename std::iterator_traits<InputIterator>::value_type;
using output_type = typename std::iterator_traits<OutputIterator>::value_type;
Expand Down Expand Up @@ -140,7 +142,7 @@ void scan_impl(void * temporary_storage,
auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(result_type));

if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
scan_impl<false, config>(
scan_impl<false, false, config>(
nested_temp_storage,
nested_temp_storage_size,
block_prefixes, // input
Expand Down Expand Up @@ -188,22 +190,24 @@ void scan_impl(void * temporary_storage,

template<
bool Exclusive,
bool UseLoopback,
class Config,
class InputIterator,
class OutputIterator,
class InitValueType,
class BinaryFunction
>
inline
void lookback_scan_impl(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
const InitValueType initial_value,
const size_t size,
BinaryFunction scan_op,
hc::accelerator_view acc_view,
const bool debug_synchronous)
auto scan_impl(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
const InitValueType initial_value,
const size_t size,
BinaryFunction scan_op,
hc::accelerator_view acc_view,
const bool debug_synchronous)
-> typename std::enable_if<UseLoopback>::type
{
using input_type = typename std::iterator_traits<InputIterator>::value_type;
using output_type = typename std::iterator_traits<OutputIterator>::value_type;
Expand Down Expand Up @@ -404,24 +408,14 @@ void inclusive_scan(void * temporary_storage,
>::type;

// Lookback scan has problems with types that are not arithmetic
if(::rocprim::is_arithmetic<result_type>::value)
{
return detail::lookback_scan_impl<false, Config>(
temporary_storage, storage_size,
// result_type() is a dummy initial value (not used)
input, output, result_type(), size,
scan_op, acc_view, debug_synchronous
);
}
else
{
return detail::scan_impl<false, Config>(
temporary_storage, storage_size,
// result_type() is a dummy initial value (not used)
input, output, result_type(), size,
scan_op, acc_view, debug_synchronous
);
}
// TODO: Investigate why the compiler never finishes linking if half is used
// Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic
detail::scan_impl<false, std::is_arithmetic<result_type>::value, Config>(
temporary_storage, storage_size,
// result_type() is a dummy initial value (not used)
input, output, result_type(), size,
scan_op, acc_view, debug_synchronous
);
}

/// \brief HC parallel exclusive scan primitive for device level.
Expand Down Expand Up @@ -533,22 +527,13 @@ void exclusive_scan(void * temporary_storage,
>::type;

// Lookback scan has problems with types that are not arithmetic
if(::rocprim::is_arithmetic<result_type>::value)
{
return detail::lookback_scan_impl<true, Config>(
temporary_storage, storage_size,
input, output, initial_value, size,
scan_op, acc_view, debug_synchronous
);
}
else
{
return detail::scan_impl<true, Config>(
temporary_storage, storage_size,
input, output, initial_value, size,
scan_op, acc_view, debug_synchronous
);
}
// TODO: Investigate why the compiler never finishes linking if half is used
// Workaround: rocprim::is_arithmetic is replaced by std::is_arithmetic
detail::scan_impl<true, std::is_arithmetic<result_type>::value, Config>(
temporary_storage, storage_size,
input, output, initial_value, size,
scan_op, acc_view, debug_synchronous
);
}

/// @}
Expand Down
Loading