From 872a1ca790f8a48265579f5314382f434098c857 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Tue, 21 Aug 2018 19:04:38 +0000 Subject: [PATCH] Initial changes for HIP-Clang Compiler These cmake changes, function changes, test changes, attribute changes and more are needed for the hip-clang compiler to be used with rocPRIM. --- CMakeLists.txt | 15 +++++++ benchmark/CMakeLists.txt | 2 + cmake/VerifyCompiler.cmake | 6 +-- hipcub/include/hipcub/rocprim/util_ptx.hpp | 6 +-- rocprim/CMakeLists.txt | 8 ++++ .../rocprim/device/device_scan_by_key_hip.hpp | 3 +- .../device/device_segmented_scan_hip.hpp | 3 +- rocprim/include/rocprim/intrinsics/atomic.hpp | 10 ++--- rocprim/include/rocprim/intrinsics/thread.hpp | 9 ++-- rocprim/include/rocprim/intrinsics/warp.hpp | 9 ++-- .../rocprim/intrinsics/warp_shuffle.hpp | 8 ++-- .../include/rocprim/iterator/zip_iterator.hpp | 2 + rocprim/include/rocprim/types.hpp | 4 +- rocprim/include/rocprim/types/tuple.hpp | 2 +- .../warp/detail/warp_segment_bounds.hpp | 2 +- test/CMakeLists.txt | 23 +++++++--- test/rocprim/CMakeLists.txt | 3 +- test/rocprim/test_hip_block_exchange.cpp | 2 - test/rocprim/test_hip_block_load_store.cpp | 44 +++++++++---------- test/rocprim/test_hip_transform_iterator.cpp | 2 + test/rocprim/test_hip_warp_reduce.cpp | 12 ++--- test/rocprim/test_hip_warp_sort.cpp | 2 +- 22 files changed, 107 insertions(+), 70 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 126fc7099..0447a0471 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,21 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") +# Determine if CXX Compiler is hcc, hip-clang or other +execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--version" OUTPUT_VARIABLE CXX_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE) +string(REGEX MATCH "[A-Za-z]* ?clang version" TMP_CXX_VERSION ${CXX_OUTPUT}) +string(REGEX MATCH "[A-Za-z]+" CXX_VERSION_STRING ${TMP_CXX_VERSION}) + if(CXX_VERSION_STRING MATCHES "HCC") + set(HIP_COMPILER "hcc" CACHE STRING "HIP Compiler") +elseif(CXX_VERSION_STRING MATCHES "clang") + set(HIP_COMPILER "clang" CACHE STRING "HIP Compiler") +else() + message(FATAL_ERROR "CXX Compiler version ${CXX_VERSION_STRING} unsupported.") +endif() +message(STATUS "HIP Compiler: " ${HIP_COMPILER}) + # Build options option(BUILD_TEST "Build tests (requires googletest)" ON) option(BUILD_BENCHMARK "Build benchmarks" OFF) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index a3745417b..071bf94cd 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -79,6 +79,7 @@ add_rocprim_benchmark_hip(benchmark_hip_warp_sort.cpp) add_rocprim_benchmark_hip(benchmark_hip_device_memory.cpp) # rocPRIM HC benchmarks +if(HIP_COMPILER STREQUAL "HCC") add_rocprim_benchmark_hc(benchmark_hc_block_discontinuity.cpp) add_rocprim_benchmark_hc(benchmark_hc_block_exchange.cpp) add_rocprim_benchmark_hc(benchmark_hc_block_histogram.cpp) @@ -95,3 +96,4 @@ add_rocprim_benchmark_hc(benchmark_hc_device_select.cpp) add_rocprim_benchmark_hc(benchmark_hc_device_transform.cpp) add_rocprim_benchmark_hc(benchmark_hc_warp_scan.cpp) add_rocprim_benchmark_hc(benchmark_hc_warp_sort.cpp) +endif() diff --git a/cmake/VerifyCompiler.cmake b/cmake/VerifyCompiler.cmake index b8b10be3e..ea81a8fee 100644 --- a/cmake/VerifyCompiler.cmake +++ b/cmake/VerifyCompiler.cmake @@ -32,8 +32,8 @@ if(HIP_PLATFORM STREQUAL "nvcc") include(cmake/SetupNVCC.cmake) message(STATUS "rocPRIM does not support NVCC. Only hipCUB will be available.") elseif(HIP_PLATFORM STREQUAL "hcc") - if(NOT (CMAKE_CXX_COMPILER MATCHES ".*/hcc$")) - message(FATAL_ERROR "On ROCm platform 'hcc' must be used as C++ compiler.") + if(NOT (CMAKE_CXX_COMPILER MATCHES ".*/hcc$" OR CMAKE_CXX_COMPILER MATCHES ".*/hipcc$")) + message(FATAL_ERROR "On ROCm platform 'hcc' or 'clang' must be used as C++ compiler.") else() # Workaround until hcc & hip cmake modules fixes symlink logic in their config files. # (Thanks to rocBLAS devs for finding workaround for this problem.) @@ -44,5 +44,5 @@ elseif(HIP_PLATFORM STREQUAL "hcc") find_package(hip REQUIRED CONFIG PATHS /opt/rocm) endif() else() - message(FATAL_ERROR "HIP_PLATFORM must be 'hcc' (AMD ROCm platform) or `nvcc` (NVIDIA CUDA platform).") + message(FATAL_ERROR "HIP_PLATFORM must be 'hcc' or 'clang' (AMD ROCm platform) or `nvcc` (NVIDIA CUDA platform).") endif() diff --git a/hipcub/include/hipcub/rocprim/util_ptx.hpp b/hipcub/include/hipcub/rocprim/util_ptx.hpp index a3d925edc..68be62f76 100644 --- a/hipcub/include/hipcub/rocprim/util_ptx.hpp +++ b/hipcub/include/hipcub/rocprim/util_ptx.hpp @@ -183,7 +183,7 @@ auto unsigned_bit_extract(UnsignedBits source, -> typename std::enable_if::type { #ifdef __HIP_PLATFORM_HCC__ - return ::hc::__bitextract_u64(source, bit_start, num_bits); + return __bitextract_u64(source, bit_start, num_bits); #else return (source << (64 - bit_start - num_bits)) >> (64 - num_bits); #endif // __HIP_PLATFORM_HCC__ @@ -197,7 +197,7 @@ auto unsigned_bit_extract(UnsignedBits source, -> typename std::enable_if::type { #ifdef __HIP_PLATFORM_HCC__ - return ::hc::__bitextract_u32(source, bit_start, num_bits); + return __bitextract_u32(source, bit_start, num_bits); #else return (static_cast(source) << (32 - bit_start - num_bits)) >> (32 - num_bits); #endif // __HIP_PLATFORM_HCC__ @@ -228,7 +228,7 @@ void BFI(unsigned int &ret, unsigned int num_bits) { #ifdef __HIP_PLATFORM_HCC__ - ret = ::hc::__bitinsert_u32(x, y, bit_start, num_bits); + ret = __bitinsert_u32(x, y, bit_start, num_bits); #else x <<= bit_start; unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; diff --git a/rocprim/CMakeLists.txt b/rocprim/CMakeLists.txt index 0aae2ef9b..671b27e0e 100644 --- a/rocprim/CMakeLists.txt +++ b/rocprim/CMakeLists.txt @@ -42,12 +42,20 @@ target_include_directories(rocprim # This target allows using only HC interface, links only # against HC/HSA library, doesn't require HIP add_library(rocprim_hc INTERFACE) +if(HIP_COMPILER STREQUAL "HCC") target_link_libraries(rocprim_hc INTERFACE rocprim hcc::hccrt hcc::hc_am ) +elseif(HIP_COMPILER STREQUAL "clang") +target_link_libraries(rocprim_hc + INTERFACE + rocprim +) +endif() + target_compile_definitions(rocprim_hc INTERFACE ROCPRIM_HC_API=1 diff --git a/rocprim/include/rocprim/device/device_scan_by_key_hip.hpp b/rocprim/include/rocprim/device/device_scan_by_key_hip.hpp index 783fb8207..6b2ae7dec 100644 --- a/rocprim/include/rocprim/device/device_scan_by_key_hip.hpp +++ b/rocprim/include/rocprim/device/device_scan_by_key_hip.hpp @@ -304,7 +304,8 @@ hipError_t exclusive_scan_by_key(void * temporary_storage, ) ) ), - [initial_value, key_compare_op](const ::rocprim::tuple& t) + [initial_value, key_compare_op] ROCPRIM_HOST_DEVICE + (const ::rocprim::tuple& t) -> ::rocprim::tuple { if(!key_compare_op(::rocprim::get<1>(t), ::rocprim::get<2>(t))) diff --git a/rocprim/include/rocprim/device/device_segmented_scan_hip.hpp b/rocprim/include/rocprim/device/device_segmented_scan_hip.hpp index 4e6eb6ebc..794cc1b95 100644 --- a/rocprim/include/rocprim/device/device_segmented_scan_hip.hpp +++ b/rocprim/include/rocprim/device/device_segmented_scan_hip.hpp @@ -617,7 +617,8 @@ hipError_t segmented_exclusive_scan(void * temporary_storage, head_flags ) ), - [initial_value](const ::rocprim::tuple& t) + [initial_value] ROCPRIM_HOST_DEVICE + (const ::rocprim::tuple& t) -> ::rocprim::tuple { if(::rocprim::get<1>(t)) diff --git a/rocprim/include/rocprim/intrinsics/atomic.hpp b/rocprim/include/rocprim/intrinsics/atomic.hpp index 0e373531e..1a8f2e17f 100644 --- a/rocprim/include/rocprim/intrinsics/atomic.hpp +++ b/rocprim/include/rocprim/intrinsics/atomic.hpp @@ -33,7 +33,7 @@ namespace detail #ifdef ROCPRIM_HC_API return hc::atomic_fetch_add(address, value); #else - return ::atomicAdd(address, value); + return atomicAdd(address, value); #endif } @@ -43,7 +43,7 @@ namespace detail #ifdef ROCPRIM_HC_API return hc::atomic_fetch_add(address, value); #else - return ::atomicAdd(address, value); + return atomicAdd(address, value); #endif } @@ -53,7 +53,7 @@ namespace detail #ifdef ROCPRIM_HC_API return hc::atomic_fetch_add(address, value); #else - return ::atomicAdd(address, value); + return atomicAdd(address, value); #endif } @@ -63,7 +63,7 @@ namespace detail #ifdef ROCPRIM_HC_API return hc::atomic_fetch_add(reinterpret_cast(address), static_cast(value)); #else - return ::atomicAdd(address, value); + return atomicAdd(address, value); #endif } @@ -73,7 +73,7 @@ namespace detail #ifdef ROCPRIM_HC_API return hc::__atomic_wrapinc(address, value); #else - return ::atomicInc(address, value); + return atomicInc(address, value); #endif } } diff --git a/rocprim/include/rocprim/intrinsics/thread.hpp b/rocprim/include/rocprim/intrinsics/thread.hpp index 83bc1dec3..582cbd79e 100644 --- a/rocprim/include/rocprim/intrinsics/thread.hpp +++ b/rocprim/include/rocprim/intrinsics/thread.hpp @@ -75,8 +75,7 @@ unsigned int lane_id() #ifdef ROCPRIM_HC_API return hc::__lane_id(); #else // HIP - // TODO: Find HIP function for that - return hc::__lane_id(); + return __lane_id(); #endif } @@ -262,19 +261,19 @@ namespace detail ROCPRIM_DEVICE inline void memory_fence_system(void) { - ::__threadfence_system(); + __threadfence_system(); } ROCPRIM_DEVICE inline void memory_fence_block(void) { - ::__threadfence_block(); + __threadfence_block(); } ROCPRIM_DEVICE inline void memory_fence_device(void) { - ::__threadfence(); + __threadfence(); } #else // __threadfence_system() diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index f72b4e54e..f67eecfab 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -56,10 +56,9 @@ unsigned int masked_bit_count(unsigned long long x, unsigned int add = 0) c = hc::__amdgcn_mbcnt_hi(static_cast(x >> 32), c); return c; #else // HIP - // TODO: Use HIP function(s) int c; - c = hc::__amdgcn_mbcnt_lo(static_cast(x), add); - c = hc::__amdgcn_mbcnt_hi(static_cast(x >> 32), c); + c = __mbcnt_lo(static_cast(x), add); + c = __mbcnt_hi(static_cast(x >> 32), c); return c; #endif } @@ -73,7 +72,7 @@ int warp_any(int predicate) #ifdef ROCPRIM_HC_API return hc::__any(predicate); #else // ROCPRIM_HIP_API - return ::__any(predicate); + return __any(predicate); #endif } @@ -83,7 +82,7 @@ int warp_all(int predicate) #ifdef ROCPRIM_HC_API return hc::__all(predicate); #else // ROCPRIM_HIP_API - return ::__all(predicate); + return __all(predicate); #endif } diff --git a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp index a9afb9e36..89bf84a7e 100644 --- a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp +++ b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp @@ -102,7 +102,7 @@ T warp_shuffle(T input, const int src_lane, const int width = warp_size()) input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) return hc::__shfl(v, src_lane, width); #else return __shfl(v, src_lane, width); @@ -131,7 +131,7 @@ T warp_shuffle_up(T input, const unsigned int delta, const int width = warp_size input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) return hc::__shfl_up(v, delta, width); #else return __shfl_up(v, delta, width); @@ -160,7 +160,7 @@ T warp_shuffle_down(T input, const unsigned int delta, const int width = warp_si input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) return hc::__shfl_down(v, delta, width); #else return __shfl_down(v, delta, width); @@ -188,7 +188,7 @@ T warp_shuffle_xor(T input, const int lane_mask, const int width = warp_size()) input, [=](int v) -> int { - #if defined(ROCPRIM_HC_API) || defined(__HIP_PLATFORM_HCC__) + #if defined(ROCPRIM_HC_API) return hc::__shfl_xor(v, lane_mask, width); #else return __shfl_xor(v, lane_mask, width); diff --git a/rocprim/include/rocprim/iterator/zip_iterator.hpp b/rocprim/include/rocprim/iterator/zip_iterator.hpp index 7239201ad..fe2b3e0fa 100644 --- a/rocprim/include/rocprim/iterator/zip_iterator.hpp +++ b/rocprim/include/rocprim/iterator/zip_iterator.hpp @@ -92,12 +92,14 @@ struct decrement_iterator template struct advance_iterator { + ROCPRIM_HOST_DEVICE inline advance_iterator(Difference distance) : distance_(distance) { } template + ROCPRIM_HOST_DEVICE inline void operator()(Iterator& it) { using it_distance_type = typename std::iterator_traits::difference_type; diff --git a/rocprim/include/rocprim/types.hpp b/rocprim/include/rocprim/types.hpp index 15bfa7f56..fb98fa0ee 100644 --- a/rocprim/include/rocprim/types.hpp +++ b/rocprim/include/rocprim/types.hpp @@ -119,7 +119,7 @@ struct empty_type #ifdef ROCPRIM_HC_API using half = ::hc::half; #else // HIP - using half = ::__half; + using half = __half; #endif END_ROCPRIM_NAMESPACE @@ -127,4 +127,4 @@ END_ROCPRIM_NAMESPACE /// @} // end of group utilsmodule -#endif // ROCPRIM_TYPES_HPP_ \ No newline at end of file +#endif // ROCPRIM_TYPES_HPP_ diff --git a/rocprim/include/rocprim/types/tuple.hpp b/rocprim/include/rocprim/types/tuple.hpp index 870a31728..fc8c24389 100644 --- a/rocprim/include/rocprim/types/tuple.hpp +++ b/rocprim/include/rocprim/types/tuple.hpp @@ -186,7 +186,7 @@ namespace detail using is_final = std::integral_constant; #else template - struct is_final : std::false_type; + struct is_final : std::false_type { }; #endif diff --git a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp index 124f5bdca..82617c6e8 100644 --- a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp @@ -53,7 +53,7 @@ unsigned int last_in_warp_segment(Flag flag) // Make sure last item in logical warp is marked as a tail warp_flags |= ballot_type(1) << (WarpSize - 1U); // Calculate logical lane id of the last valid value in the segment - return hc::__lastbit_u32_u64(warp_flags); + return __lastbit_u32_u64(warp_flags); } } // end namespace detail diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 26faae8ab..0e4ee0af4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -61,12 +61,19 @@ function(add_hc_test TEST_NAME TEST_SOURCES) PUBLIC ${GTEST_INCLUDE_DIRS} ) - target_link_libraries(${TEST_TARGET} - PRIVATE - hcc::hccrt - hcc::hc_am - ${GTEST_BOTH_LIBRARIES} - ) + if(HIP_COMPILER STREQUAL "HCC") + target_link_libraries(${TEST_TARGET} + PRIVATE + hcc::hccrt + hcc::hc_am + ${GTEST_BOTH_LIBRARIES} + ) + elseif(HIP_COMPILER STREQUAL "clang") + target_link_libraries(${TEST_TARGET} + PRIVATE + ${GTEST_BOTH_LIBRARIES} + ) + endif() foreach(amdgpu_target ${AMDGPU_TARGETS}) target_link_libraries(${TEST_TARGET} PRIVATE @@ -87,7 +94,9 @@ endfunction() # HC and HIP tests without using rocPRIM if(HIP_PLATFORM STREQUAL "hcc") add_hip_test("hc.device_api" test_hip_api.cpp) - add_hc_test ("hip.device_api" test_hc_api.cpp) + if(HIP_COMPILER STREQUAL "HCC") + add_hc_test ("hip.device_api" test_hc_api.cpp) + endif() endif() # rocPRIM test (run only on ROCm/hcc) diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index cace5b347..bf6e120c2 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -63,6 +63,7 @@ endfunction() # # HCP basic test, which also checks if there are no linkage problems when there are multiple sources +if(HIP_COMPILER STREQUAL "HCC") add_rocprim_test_hc("rocprim.hc.basic_test" "test_hc_basic.cpp;detail/get_rocprim_version_hc.cpp") add_rocprim_test_hc("rocprim.hc.arg_index_iterator" test_hc_arg_index_iterator.cpp) @@ -98,7 +99,7 @@ add_rocprim_test_hc("rocprim.hc.warp_reduce" test_hc_warp_reduce.cpp) add_rocprim_test_hc("rocprim.hc.warp_scan" test_hc_warp_scan.cpp) add_rocprim_test_hc("rocprim.hc.warp_sort" test_hc_warp_sort.cpp) add_rocprim_test_hc("rocprim.hc.zip_iterator" test_hc_zip_iterator.cpp) - +endif() # # rocPRIM HIP API tests # diff --git a/test/rocprim/test_hip_block_exchange.cpp b/test/rocprim/test_hip_block_exchange.cpp index 4314e1f22..eb4ef6e76 100644 --- a/test/rocprim/test_hip_block_exchange.cpp +++ b/test/rocprim/test_hip_block_exchange.cpp @@ -28,8 +28,6 @@ // Google Test #include -// HC API -#include // rocPRIM API #include diff --git a/test/rocprim/test_hip_block_load_store.cpp b/test/rocprim/test_hip_block_load_store.cpp index 6739a79f7..674ea1d40 100644 --- a/test/rocprim/test_hip_block_load_store.cpp +++ b/test/rocprim/test_hip_block_load_store.cpp @@ -27,8 +27,8 @@ // Google Test #include // HC API -#include -#include +#include +#include // rocPRIM API #include @@ -221,26 +221,26 @@ typedef ::testing::Types< params, params, - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params, - - params, - params, - params, - params, - params, - params + params, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params, + + params, + params, + params, + params, + params, + params > Params; TYPED_TEST_CASE(RocprimBlockLoadStoreClassTests, ClassParams); diff --git a/test/rocprim/test_hip_transform_iterator.cpp b/test/rocprim/test_hip_transform_iterator.cpp index 55d64a81e..1190f5f1a 100644 --- a/test/rocprim/test_hip_transform_iterator.cpp +++ b/test/rocprim/test_hip_transform_iterator.cpp @@ -39,6 +39,7 @@ template struct times_two { + ROCPRIM_HOST_DEVICE T operator()(const T& value) const { return 2 * value; @@ -48,6 +49,7 @@ struct times_two template struct plus_ten { + ROCPRIM_HOST_DEVICE T operator()(const T& value) const { return value + 10; diff --git a/test/rocprim/test_hip_warp_reduce.cpp b/test/rocprim/test_hip_warp_reduce.cpp index e4fde26d8..0b144d79b 100644 --- a/test/rocprim/test_hip_warp_reduce.cpp +++ b/test/rocprim/test_hip_warp_reduce.cpp @@ -97,7 +97,7 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output) T value = device_input[index]; using wreduce_t = rp::warp_reduce; - tile_static typename wreduce_t::storage_type storage[warps_no]; + __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, storage[warp_id]); if(hipThreadIdx_x%LogicalWarpSize == 0) @@ -204,7 +204,7 @@ void warp_allreduce_sum_kernel(T* device_input, T* device_output) T value = device_input[index]; using wreduce_t = rp::warp_reduce; - tile_static typename wreduce_t::storage_type storage[warps_no]; + __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, storage[warp_id]); device_output[index] = value; @@ -312,7 +312,7 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) T value = device_input[index]; using wreduce_t = rp::warp_reduce; - tile_static typename wreduce_t::storage_type storage[warps_no]; + __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, valid, storage[warp_id]); if(hipThreadIdx_x%LogicalWarpSize == 0) @@ -420,7 +420,7 @@ void warp_allreduce_sum_kernel(T* device_input, T* device_output, size_t valid) T value = device_input[index]; using wreduce_t = rp::warp_reduce; - tile_static typename wreduce_t::storage_type storage[warps_no]; + __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, valid, storage[warp_id]); device_output[index] = value; @@ -623,7 +623,7 @@ void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) auto flag = flags[index]; using wreduce_t = rp::warp_reduce; - tile_static typename wreduce_t::storage_type storage[warps_no]; + __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().head_segmented_reduce(value, value, flag, storage[warp_id]); output[index] = value; @@ -750,7 +750,7 @@ void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) auto flag = flags[index]; using wreduce_t = rp::warp_reduce; - tile_static typename wreduce_t::storage_type storage[warps_no]; + __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().tail_segmented_reduce(value, value, flag, storage[warp_id]); output[index] = value; diff --git a/test/rocprim/test_hip_warp_sort.cpp b/test/rocprim/test_hip_warp_sort.cpp index 9f896cb92..03484e4f2 100644 --- a/test/rocprim/test_hip_warp_sort.cpp +++ b/test/rocprim/test_hip_warp_sort.cpp @@ -52,7 +52,7 @@ class RocprimWarpSortShuffleBasedTests : public ::testing::Test { }; template -bool test(const T& a, const T& b) [[hc]] +bool test(const T& a, const T& b) { return a < b; }