Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -436,14 +436,15 @@ build:cuda-and-omp:
tags:
- build
variables:
CCCL_GIT_BRANCH: v2.6.0
CCCL_GIT_BRANCH: v2.7.0
CCCL_DIR: ${CI_PROJECT_DIR}/cccl
needs: []
script:
- git clone -b $CCCL_GIT_BRANCH https://github.com/NVIDIA/cccl.git $CCCL_DIR
# Replace CCCL Thrust headers with rocThrust headers
- rm -R $CCCL_DIR/thrust/thrust
- cp -r $CI_PROJECT_DIR/thrust $CCCL_DIR/thrust
- cp $CI_PROJECT_DIR/testing/type_traits.cu $CCCL_DIR/thrust/testing
# Build tests and examples from CCCL Thrust
- cmake
-G Ninja
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Documentation for rocThrust available at
### Changed

* Updated the required version of Google Benchmark from 1.8.0 to 1.9.0.
* Drop `c++14` support for rocthrust.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The min version of C++ that's now supported needs to be mentioned. I think hipCUB and rocPRIM already have this in their changelogs, so this should be identical to those.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in #559

* Renamed `cpp14_required.h` to `cpp_version_check.h`
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I saw this in either hipCUB or rocPRIM as well but can't remember if it was removed because this change should be transparent to the user, or if it was kept in just in case.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in #559


### Removed

Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg wit
check_language(HIP)
cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF)

# Allow the user to optionally select offset type dispatch to fixed 32 or 64 bit types
set(THRUST_DISPATCH_TYPE "Dynamic" CACHE STRING "Select Thrust offset type dispatch." FORCE)
set_property(CACHE THRUST_DISPATCH_TYPE PROPERTY STRINGS "Dynamic" "Force32bit" "Force64bit")

#Adding CMAKE_PREFIX_PATH
list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm ${ROCM_PATH} )

Expand Down
4 changes: 2 additions & 2 deletions examples/include/host_device.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright 2008-2009 NVIDIA Corporation
* Modifications Copyright© 2024 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright© 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,7 +17,7 @@

#pragma once

#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC
#ifndef _CCCL_CUDA_COMPILER

# ifndef __host__
# define __host__
Expand Down
4 changes: 2 additions & 2 deletions test/test_constant_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,14 @@ TYPED_TEST(ConstantIteratorTests, ConstantIteratorTransform)
ConstIter last1 = first1 + result.size();
ConstIter first2 = make_constant_iterator<T>(3);

transform(first1, last1, result.begin(), negate<T>());
thrust::transform(first1, last1, result.begin(), thrust::negate<T>());

ASSERT_EQ(-7, result[0]);
ASSERT_EQ(-7, result[1]);
ASSERT_EQ(-7, result[2]);
ASSERT_EQ(-7, result[3]);

transform(first1, last1, first2, result.begin(), plus<T>());
thrust::transform(first1, last1, first2, result.begin(), thrust::plus<T>());

ASSERT_EQ(10, result[0]);
ASSERT_EQ(10, result[1]);
Expand Down
1 change: 1 addition & 0 deletions testing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ add_thrust_test("stable_sort_by_key_large_values")
add_thrust_test("stable_sort_large")
add_thrust_test("swap_ranges")
add_thrust_test("tabulate")
add_thrust_test("tabulate_output_iterator")
add_thrust_test("transform")
add_thrust_test("transform_iterator")
add_thrust_test("transform_input_output_iterator")
Expand Down
6 changes: 3 additions & 3 deletions testing/allocator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ struct my_allocator_with_custom_destroy

THRUST_HOST ~my_allocator_with_custom_destroy() {}

THRUST_HOST_DEVICE void destroy(T*)
THRUST_HOST_DEVICE void destroy(T*) noexcept
{
NV_IF_TARGET(NV_IS_HOST, (g_state = true;));
}
Expand All @@ -113,7 +113,7 @@ struct my_allocator_with_custom_destroy
return use_me_to_alloc.allocate(n);
}

void deallocate(value_type* ptr, std::ptrdiff_t n)
void deallocate(value_type* ptr, std::ptrdiff_t n) noexcept
{
use_me_to_alloc.deallocate(ptr, n);
}
Expand Down Expand Up @@ -175,7 +175,7 @@ struct my_minimal_allocator
return use_me_to_alloc.allocate(n);
}

void deallocate(value_type* ptr, std::ptrdiff_t n)
void deallocate(value_type* ptr, std::ptrdiff_t n) noexcept
{
use_me_to_alloc.deallocate(ptr, n);
}
Expand Down
4 changes: 2 additions & 2 deletions testing/async_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <thrust/detail/config.h>

// Disabled on MSVC && NVCC < 11.1 for GH issue #1098.
#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && defined(__CUDACC__)
#if defined(_CCCL_COMPILER_MSVC) && defined(__CUDACC__)
# if (__CUDACC_VER_MAJOR__ < 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 1)
# define THRUST_BUG_1098_ACTIVE
# endif // NVCC version check
Expand Down Expand Up @@ -102,7 +102,7 @@ struct test_async_sort
template <typename T>
struct tester
{
THRUST_HOST void operator()(std::size_t n)
THRUST_HOST_DEVICE void operator()(std::size_t n)
{
thrust::host_vector<T> h0_data(unittest::random_integers<T>(n));
thrust::device_vector<T> d0_data(h0_data);
Expand Down
31 changes: 31 additions & 0 deletions testing/cuda/scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ __global__ void inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Ite
thrust::inclusive_scan(exec, first, last, result);
}

template <typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename T, typename Pred>
__global__ void
inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result, T init, Pred pred)
{
thrust::inclusive_scan(exec, first, last, result, init, pred);
}

template <typename ExecutionPolicy, typename Iterator1, typename Iterator2>
__global__ void exclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
{
Expand Down Expand Up @@ -60,6 +67,16 @@ void TestScanDevice(ExecutionPolicy exec, const size_t n)

ASSERT_EQUAL(d_output, h_output);

thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), (T) 11, thrust::plus<T>{});

inclusive_scan_kernel<<<1, 1>>>(exec, d_input.begin(), d_input.end(), d_output.begin(), (T) 11, thrust::plus<T>{});
{
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);
}

ASSERT_EQUAL(d_output, h_output);

thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin());

exclusive_scan_kernel<<<1, 1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
Expand Down Expand Up @@ -203,6 +220,20 @@ void TestScanCudaStreams()
ASSERT_EQUAL(input, input_copy);
ASSERT_EQUAL(output, result);

// inclusive scan with init and op
iter =
thrust::inclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), 3, thrust::plus<T>());
cudaStreamSynchronize(s);

result[0] = 4;
result[1] = 7;
result[2] = 5;
result[3] = 9;
result[4] = 4;
ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
ASSERT_EQUAL(input, input_copy);
ASSERT_EQUAL(output, result);

// exclusive scan with init and op
iter =
thrust::exclusive_scan(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), 3, thrust::plus<T>());
Expand Down
173 changes: 173 additions & 0 deletions testing/cuda/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,35 @@
* limitations under the License.
*/

#include <thrust/copy.h>
#include <thrust/equal.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/reverse_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/sort.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cuda/std/limits>
namespace _std = ::cuda::std;
#elif defined(__has_include)
# if __has_include(<cuda/std/functional>)
# include <cuda/std/limits>
namespace _std = ::cuda::std;
# else
# include <limits>
namespace _std = std;
# endif
#else
# include <limits>
namespace _std = std;
#endif

#include <cstdint>
#include <exception>
#include <utility>

#include <unittest/unittest.h>

template <typename T>
Expand Down Expand Up @@ -155,3 +180,151 @@ void TestComparisonSortCudaStreams()
cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestComparisonSortCudaStreams);

template <typename T>
struct TestRadixSortDispatch
{
static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, thrust::less<T>>::value, "");
static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, thrust::greater<T>>::value, "");
static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, std::less<T>>::value, "");
static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, std::greater<T>>::value, "");

static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, thrust::less<>>::value, "");
static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, thrust::greater<>>::value, "");
static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, std::less<>>::value, "");
static_assert(thrust::cuda_cub::__smart_sort::can_use_primitive_sort<T, std::greater<>>::value, "");

void operator()() const {}
};
// TODO(bgruber): use a single test case with a concatenated key list and a cartesion product with the comparators
SimpleUnitTest<TestRadixSortDispatch, IntegralTypes> TestRadixSortDispatchIntegralInstance;
SimpleUnitTest<TestRadixSortDispatch, FloatingPointTypes> TestRadixSortDispatchFPInstance;
/**
* Copy of CUB testing utility
*/
template <typename UnsignedIntegralKeyT>
struct index_to_key_value_op
{
static constexpr std::size_t max_key_value =
static_cast<std::size_t>(_std::numeric_limits<UnsignedIntegralKeyT>::max());
static constexpr std::size_t lowest_key_value =
static_cast<std::size_t>(_std::numeric_limits<UnsignedIntegralKeyT>::lowest());
static constexpr std::size_t num_distinct_key_values = (max_key_value - lowest_key_value + std::size_t{1ULL});

__device__ __host__ UnsignedIntegralKeyT operator()(std::size_t index)
{
return static_cast<UnsignedIntegralKeyT>(index % num_distinct_key_values);
}
};

/**
* Copy of CUB testing utility
*/
template <typename UnsignedIntegralKeyT>
class index_to_expected_key_op
{
private:
static constexpr std::size_t max_key_value =
static_cast<std::size_t>(_std::numeric_limits<UnsignedIntegralKeyT>::max());
static constexpr std::size_t lowest_key_value =
static_cast<std::size_t>(_std::numeric_limits<UnsignedIntegralKeyT>::lowest());
static constexpr std::size_t num_distinct_key_values = (max_key_value - lowest_key_value + std::size_t{1ULL});

// item_count / num_distinct_key_values
std::size_t expected_count_per_item;
// num remainder items: item_count%num_distinct_key_values
std::size_t num_remainder_items;
// remainder item_count: expected_count_per_item+1
std::size_t remainder_item_count;

public:
index_to_expected_key_op(std::size_t num_total_items)
: expected_count_per_item(num_total_items / num_distinct_key_values)
, num_remainder_items(num_total_items % num_distinct_key_values)
, remainder_item_count(expected_count_per_item + std::size_t{1ULL})
{}

__device__ __host__ UnsignedIntegralKeyT operator()(std::size_t index)
{
// The first (num_remainder_items * remainder_item_count) are items that appear once more often than the items that
// follow remainder_items_offset
std::size_t remainder_items_offset = num_remainder_items * remainder_item_count;

UnsignedIntegralKeyT target_item_index =
(index <= remainder_items_offset)
?
// This is one of the remainder items
static_cast<UnsignedIntegralKeyT>(index / remainder_item_count)
:
// This is an item that appears exactly expected_count_per_item times
static_cast<UnsignedIntegralKeyT>(
num_remainder_items + ((index - remainder_items_offset) / expected_count_per_item));
return target_item_index;
}
};

void TestSortWithMagnitude(int magnitude)
{
try
{
const std::size_t num_items = 1ull << magnitude;
thrust::device_vector<std::uint8_t> vec(num_items);
auto counting_it = thrust::make_counting_iterator(std::size_t{0});
auto key_value_it = thrust::make_transform_iterator(counting_it, index_to_key_value_op<std::uint8_t>{});
auto rev_sorted_it = thrust::make_reverse_iterator(key_value_it + num_items);
thrust::copy(rev_sorted_it, rev_sorted_it + num_items, vec.begin());
thrust::sort(vec.begin(), vec.end());
auto expected_result_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(std::size_t{}), index_to_expected_key_op<std::uint8_t>(num_items));
const bool ok = thrust::equal(expected_result_it, expected_result_it + num_items, vec.cbegin());
ASSERT_EQUAL(ok, true);
}
catch (std::bad_alloc&)
{}
}

void TestSortWithLargeNumberOfItems()
{
TestSortWithMagnitude(39);
TestSortWithMagnitude(32);
TestSortWithMagnitude(33);
}
DECLARE_UNITTEST(TestSortWithLargeNumberOfItems);

template <typename T>
struct TestSortAscendingKey
{
void operator()() const
{
constexpr int n = 10000;

thrust::host_vector<T> h_data = unittest::random_integers<T>(n);
thrust::device_vector<T> d_data = h_data;

std::sort(h_data.begin(), h_data.end(), thrust::less<T>{});
thrust::sort(d_data.begin(), d_data.end(), thrust::less<T>{});

ASSERT_EQUAL_QUIET(h_data, d_data);
}
};

SimpleUnitTest<TestSortAscendingKey,
unittest::concat<unittest::type_list<>
#ifndef _LIBCUDACXX_HAS_NO_INT128
,
unittest::type_list<__int128_t, __uint128_t>
#endif
// CTK 12.2 offers __host__ __device__ operators for __half and __nv_bfloat16, so we can use std::sort
#if _CCCL_CUDACC_VER >= 1202000
# if defined(_CCCL_HAS_NVFP16) || !defined(__CUDA_NO_HALF_OPERATORS__) && !defined(__CUDA_NO_HALF_CONVERSIONS__)
,
unittest::type_list<__half>
# endif
# if defined(_CCCL_HAS_NVBF16) \
|| !defined(__CUDA_NO_BFLOAT16_OPERATORS__) && !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__)
,
unittest::type_list<__nv_bfloat16>
# endif
#endif // _CCCL_CUDACC_VER >= 1202000
>>
TestSortAscendingKeyMoreTypes;
Loading