Skip to content

Commit

Permalink
DeviceFor::ForEachInExtents (#2518)
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato authored Nov 20, 2024
1 parent 8994dc4 commit 2e6ed59
Show file tree
Hide file tree
Showing 9 changed files with 1,294 additions and 6 deletions.
241 changes: 241 additions & 0 deletions cub/cub/detail/fast_modulo_division.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
/******************************************************************************
* Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/cmath> // cuda::std::ceil_div
#include <cuda/std/bit> // std::has_single_bit
#include <cuda/std/climits> // CHAR_BIT
#include <cuda/std/cstdint> // uint64_t
#include <cuda/std/limits> // numeric_limits
#include <cuda/std/type_traits> // std::is_integral

#include "cub/detail/type_traits.cuh" // implicit_prom_t
#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED

#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
_CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
#endif // CCCL_ENABLE_DEVICE_ASSERTIONS

CUB_NAMESPACE_BEGIN

namespace detail
{

/***********************************************************************************************************************
* larger_unsigned_type
**********************************************************************************************************************/

// Fallback of the `larger_unsigned_type` trait: selected only when none of the size-based partial specializations
// applies (e.g. an 8-byte type when the 128-bit specialization is compiled out).
// No wider unsigned type is available in that case, so instantiating the fallback must be a hard error that carries
// the diagnostic below. The condition is dependent on T, so it is evaluated only on instantiation, and it is false for
// every T that can reach this definition (all sizes below 4 bytes are handled by a specialization).
template <typename T, typename = void>
struct larger_unsigned_type
{
  static_assert(sizeof(T) < 4, "64-bit integer are only supported from CUDA >= 11.5");
  using type = void;
};

// Types narrower than 4 bytes: products are computed in a 32-bit unsigned integer.
template <typename T>
struct larger_unsigned_type<T, ::cuda::std::enable_if_t<(sizeof(T) < 4)>>
{
  using type = ::cuda::std::uint32_t;
};

// 4-byte types: products are computed in a 64-bit unsigned integer.
template <typename T>
struct larger_unsigned_type<T, ::cuda::std::enable_if_t<(sizeof(T) == 4)>>
{
  using type = ::cuda::std::uint64_t;
};

#if CUB_IS_INT128_ENABLED

// 8-byte types: products are computed in a 128-bit unsigned integer (only compiled when CUB_IS_INT128_ENABLED).
template <typename T>
struct larger_unsigned_type<T, ::cuda::std::enable_if_t<(sizeof(T) == 8)>>
{
  using type = __uint128_t;
};

#endif // CUB_IS_INT128_ENABLED

// Convenience alias: the unsigned type selected by `larger_unsigned_type` for T
template <typename T>
using larger_unsigned_type_t = typename larger_unsigned_type<T>::type;

// Unsigned counterpart of `implicit_prom_t<T>`
template <typename T>
using unsigned_implicit_prom_t = ::cuda::std::make_unsigned_t<implicit_prom_t<T>>;

// Compile-time predicate: true for integral types other than `bool` whose size does not exceed 8 bytes
template <typename T>
using supported_integral =
  ::cuda::std::integral_constant<bool,
                                 (sizeof(T) <= 8) && !::cuda::std::is_same<T, bool>::value
                                   && ::cuda::std::is_integral<T>::value>;

/***********************************************************************************************************************
* Extract higher bits after multiplication
**********************************************************************************************************************/

// Multiplies `value` by `multiplier` and returns the product shifted right by the bit width of `DivisorType`,
// converted to `unsigned_implicit_prom_t<DivisorType>`.
//
// - DivisorType: determines the shift amount (sizeof(DivisorType) * CHAR_BIT) and the wide type used for the product
//                (larger_unsigned_type_t<DivisorType>)
// - value, multiplier: supported integral values; both must be non-negative (checked with _CCCL_ASSERT)
template <typename DivisorType, typename T, typename R>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_implicit_prom_t<DivisorType>
multiply_extract_higher_bits(T value, R multiplier)
{
  static_assert(supported_integral<T>::value, "unsupported type");
  static_assert(supported_integral<R>::value, "unsupported type");
  _CCCL_ASSERT(value >= 0, "value must be non-negative");
  _CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative");
  static constexpr int NumBits = sizeof(DivisorType) * CHAR_BIT;
  using unsigned_t             = unsigned_implicit_prom_t<DivisorType>;
  using larger_t               = larger_unsigned_type_t<DivisorType>;
  // The device shortcut (__umul64hi yields the upper 64 bits of a 64x64-bit product) is only equivalent to
  // "product >> NumBits" when NumBits == 64, i.e. when *DivisorType* is 8 bytes wide. Selecting it on the size of the
  // dividend type T instead would make host and device disagree for a 64-bit dividend with a narrower divisor.
  // clang-format off
  NV_IF_TARGET(
    NV_IS_HOST,
      (return static_cast<unsigned_t>((static_cast<larger_t>(value) * multiplier) >> NumBits);),
    //NV_IS_DEVICE
      (return (sizeof(DivisorType) == 8)
        ? static_cast<unsigned_t>(__umul64hi(value, multiplier))
        : static_cast<unsigned_t>((static_cast<larger_t>(value) * multiplier) >> NumBits);));
  // clang-format on
}

/***********************************************************************************************************************
* Fast Modulo/Division based on Precomputation
**********************************************************************************************************************/

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */

// Division/modulo by a divisor that is fixed at construction time.
//
// The constructor precomputes a multiplier and a right-shift amount; `operator()` then obtains the quotient with a
// multiplication and a shift (see `multiply_extract_higher_bits`) and derives the remainder from it.
// Preconditions (checked with _CCCL_ASSERT): the divisor is positive and every dividend is non-negative.
template <typename T1>
class fast_div_mod
{
  static_assert(supported_integral<T1>::value, "unsupported type");

  // uint16_t is a special case that would require complex logic. Workaround: convert to int
  using T          = ::cuda::std::conditional_t<::cuda::std::is_same<T1, ::cuda::std::uint16_t>::value, int, T1>;
  using unsigned_t = unsigned_implicit_prom_t<T>;

public:
  // Quotient/remainder pair returned by `operator()`. Both members use the type of the built-in expression `R / T`.
  template <typename R>
  struct result
  {
    using common_t = decltype(R{} / T{});
    common_t quotient;
    common_t remainder;
  };

  fast_div_mod() = delete;

  // Precomputes the multiplier and shift for `divisor` (must be > 0).
  _CCCL_NODISCARD _CCCL_HOST_DEVICE explicit fast_div_mod(T divisor) noexcept
      : _divisor{static_cast<unsigned_t>(divisor)}
  {
    using larger_t = larger_unsigned_type_t<T>;
    _CCCL_ASSERT(divisor > 0, "divisor must be positive");
    auto udivisor = static_cast<unsigned_t>(divisor);
    // the following branches are needed to avoid negative shift
    if (::cuda::std::has_single_bit(udivisor)) // power of two
    {
      // shift only: `_multiplier` keeps its default 0, which operator() interprets as "no multiplication"
      _shift_right = ::cuda::std::bit_width(udivisor) - 1;
      return;
    }
    else if (sizeof(T) == 8 && divisor == 3)
    {
      // nothing is precomputed: operator() has a matching branch that uses the built-in `/` and `%`
      return;
    }
    constexpr int BitSize   = sizeof(T) * CHAR_BIT; // e.g. 32 for a 4-byte T
    constexpr int BitOffset = BitSize / 16; // e.g. 2 for a 4-byte T
    int num_bits            = ::cuda::std::bit_width(udivisor) + 1;
    _CCCL_ASSERT(static_cast<size_t>(num_bits + BitSize - BitOffset) < sizeof(larger_t) * CHAR_BIT, "overflow error");
    // without explicit power-of-two check, num_bits needs to replace +1 with !::cuda::std::has_single_bit(udivisor)
    // multiplier = ceil(2^(num_bits + BitSize - BitOffset) / divisor)
    _multiplier = static_cast<unsigned_t>(::cuda::ceil_div(larger_t{1} << (num_bits + BitSize - BitOffset), //
                                                           static_cast<larger_t>(divisor)));
    _shift_right = num_bits - BitOffset;
    _CCCL_ASSERT(_multiplier != 0, "overflow error");
  }

  fast_div_mod(const fast_div_mod&) noexcept = default;

  fast_div_mod(fast_div_mod&&) noexcept = default;

  // Returns the quotient and the remainder of `dividend / divisor`. `dividend` must be non-negative.
  template <typename R>
  _CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE result<R> operator()(R dividend) const noexcept
  {
    static_assert(supported_integral<R>::value, "unsupported type");
    using common_t  = decltype(R{} / T{});
    using ucommon_t = ::cuda::std::make_unsigned_t<common_t>;
    using result_t  = result<R>;
    _CCCL_ASSERT(dividend >= 0, "dividend must be non-negative");
    auto udividend = static_cast<ucommon_t>(dividend);
    if (_divisor == 1)
    {
      return result_t{static_cast<common_t>(dividend), common_t{}};
    }
    else if (_divisor > unsigned_t{::cuda::std::numeric_limits<T>::max() / 2})
    {
      // the quotient is obtained with a single comparison, i.e. it is either 0 or 1
      // NOTE(review): presumably relies on dividend < 2 * divisor -- confirm for dividend types wider than T
      auto quotient = udividend >= static_cast<ucommon_t>(_divisor);
      return result_t{static_cast<common_t>(quotient), static_cast<common_t>(udividend - (quotient * _divisor))};
    }
    else if (sizeof(T) == 8 && _divisor == 3)
    {
      // mirrors the constructor branch that skips the precomputation for this case
      return result_t{static_cast<common_t>(udividend / 3), static_cast<common_t>(udividend % 3)};
    }
    // `_multiplier == 0` marks a power-of-two divisor: the quotient is just a right shift of the dividend
    auto higher_bits = (_multiplier == 0) ? udividend : multiply_extract_higher_bits<T>(dividend, _multiplier);
    auto quotient    = higher_bits >> _shift_right;
    auto remainder   = udividend - (quotient * _divisor);
    _CCCL_ASSERT(quotient == udividend / _divisor, "wrong quotient");
    _CCCL_ASSERT(remainder < (ucommon_t) _divisor, "remainder out of range");
    return result_t{static_cast<common_t>(quotient), static_cast<common_t>(remainder)};
  }

  // `dividend / div`: quotient computed by `div(dividend)`.
  // NOTE(review): the result is converted from result<R>::common_t to implicit_prom_t<T>; confirm that this is
  // intended when R is wider than T.
  template <typename R>
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE friend implicit_prom_t<T> operator/(R dividend, fast_div_mod div) noexcept
  {
    return div(dividend).quotient;
  }

  // `dividend % div`: remainder computed by `div(dividend)`.
  template <typename R>
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE friend implicit_prom_t<T> operator%(R dividend, fast_div_mod div) noexcept
  {
    return div(dividend).remainder;
  }

private:
  unsigned_t _divisor    = 1; // divisor, stored as unsigned
  unsigned_t _multiplier = 0; // precomputed multiplier; 0 when the quotient needs no multiplication
  unsigned _shift_right  = 0; // right shift applied after the (optional) multiplication
};
_CCCL_DIAG_POP

} // namespace detail

CUB_NAMESPACE_END

#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
_CCCL_NV_DIAG_DEFAULT(186)
#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
122 changes: 122 additions & 0 deletions cub/cub/detail/mdspan_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/******************************************************************************
* Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if __cccl_lib_mdspan

# include <cub/detail/fast_modulo_division.cuh> // fast_div_mod

# include <cuda/std/array> // std::array
# include <cuda/std/cstddef> // size_t
# include <cuda/std/mdspan>
# include <cuda/std/type_traits> // make_unsigned_t
# include <cuda/std/utility> // ::cuda::std::index_sequence

CUB_NAMESPACE_BEGIN

namespace detail
{

// Product of the extents from rank `Rank` (inclusive) up to the last rank of `ext`.
// The loop does not execute when `Rank` equals the number of ranks, in which case the result is 1.
template <::cuda::std::size_t Rank, typename IndexType, ::cuda::std::size_t Extent0, ::cuda::std::size_t... Extents>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
sub_size(const ::cuda::std::extents<IndexType, Extent0, Extents...>& ext)
{
  using size_type   = ::cuda::std::make_unsigned_t<IndexType>;
  size_type product = 1;
  IndexType rank    = Rank;
  while (rank < IndexType{1 + sizeof...(Extents)}) // <- pointless comparison with zero-rank extent
  {
    product *= ext.extent(rank);
    ++rank;
  }
  return product;
}

// Overload for extents without any rank: the size is always 1.
// A separate overload avoids the "pointless comparison of unsigned integer with zero" warning
// (nvcc 11.x doesn't support nv_diag warning suppression)
template <::cuda::std::size_t Rank, typename IndexType>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
sub_size(const ::cuda::std::extents<IndexType>&)
{
  using size_type = ::cuda::std::make_unsigned_t<IndexType>;
  return size_type{1};
}

// Total size of `ext`, computed as the sub-size starting at rank 0.
// TODO: move to cuda::std
template <typename IndexType, ::cuda::std::size_t... Extents>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
size(const ::cuda::std::extents<IndexType, Extents...>& ext)
{
  constexpr ::cuda::std::size_t first_rank = 0;
  return sub_size<first_rank>(ext);
}

// Builds one `fast_div_mod` per entry of `Ranks`, each constructed from `sub_size<Rank + 1>(ext)`,
// i.e. the product of the extents that follow that rank.
template <typename IndexType, ::cuda::std::size_t... E, ::cuda::std::size_t... Ranks>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
{
  // deduction guides don't work with nvcc 11.x
  using div_mod_t       = fast_div_mod<IndexType>;
  using div_mod_array_t = ::cuda::std::array<div_mod_t, sizeof...(Ranks)>;
  return div_mod_array_t{div_mod_t(sub_size<Ranks + 1>(ext))...};
}

// Builds one `fast_div_mod` per entry of `Ranks`, each constructed from the extent of that rank.
template <typename IndexType, ::cuda::std::size_t... E, ::cuda::std::size_t... Ranks>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
{
  using div_mod_t       = fast_div_mod<IndexType>;
  using div_mod_array_t = ::cuda::std::array<div_mod_t, sizeof...(Ranks)>;
  return div_mod_array_t{div_mod_t(ext.extent(Ranks))...};
}

// Returns true when every extent from rank `Rank` up to the last rank of `Extents` is static,
// and false as soon as a dynamic extent is found.
// GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
template <int Rank, typename Extents>
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_sub_size_static()
{
  using index_type = typename Extents::index_type;
  index_type rank  = Rank;
  while (rank < Extents::rank())
  {
    if (Extents::static_extent(rank) == ::cuda::std::dynamic_extent)
    {
      return false;
    }
    rank++;
  }
  return true;
}

} // namespace detail

CUB_NAMESPACE_END

#endif // if __cccl_lib_mdspan
Loading

0 comments on commit 2e6ed59

Please sign in to comment.