Skip to content

Commit 2e6ed59

Browse files
authored
DeviceFor::ForEachInExtents (#2518)
1 parent 8994dc4 commit 2e6ed59

9 files changed

+1294
-6
lines changed
+241
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
/******************************************************************************
2+
* Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are met:
6+
* * Redistributions of source code must retain the above copyright
7+
* notice, this list of conditions and the following disclaimer.
8+
* * Redistributions in binary form must reproduce the above copyright
9+
* notice, this list of conditions and the following disclaimer in the
10+
* documentation and/or other materials provided with the distribution.
11+
* * Neither the name of the NVIDIA CORPORATION nor the
12+
* names of its contributors may be used to endorse or promote products
13+
* derived from this software without specific prior written permission.
14+
*
15+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18+
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
*
26+
******************************************************************************/
27+
28+
#pragma once
29+
30+
#include <cub/config.cuh>
31+
32+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
33+
# pragma GCC system_header
34+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
35+
# pragma clang system_header
36+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
37+
# pragma system_header
38+
#endif // no system header
39+
40+
#include <cuda/cmath> // cuda::ceil_div
41+
#include <cuda/std/bit> // std::has_single_bit
42+
#include <cuda/std/climits> // CHAR_BIT
43+
#include <cuda/std/cstdint> // uint64_t
44+
#include <cuda/std/limits> // numeric_limits
45+
#include <cuda/std/type_traits> // std::is_integral
46+
47+
#include "cub/detail/type_traits.cuh" // implicit_prom_t
48+
#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED
49+
50+
#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
51+
_CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
52+
#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
53+
54+
CUB_NAMESPACE_BEGIN
55+
56+
namespace detail
57+
{
58+
59+
/***********************************************************************************************************************
60+
* larger_unsigned_type
61+
**********************************************************************************************************************/
62+
63+
template <typename T, typename = void>
64+
struct larger_unsigned_type
65+
{
66+
static_assert(sizeof(T) >= 8, "64-bit integer are only supported from CUDA >= 11.5");
67+
using type = void;
68+
};
69+
70+
template <typename T>
71+
struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) < 4)>::type>
72+
{
73+
using type = ::cuda::std::uint32_t;
74+
};
75+
76+
template <typename T>
77+
struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) == 4)>::type>
78+
{
79+
using type = ::cuda::std::uint64_t;
80+
};
81+
82+
#if CUB_IS_INT128_ENABLED
83+
84+
template <typename T>
85+
struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) == 8)>::type>
86+
{
87+
using type = __uint128_t;
88+
};
89+
90+
#endif // CUB_IS_INT128_ENABLED
91+
92+
template <typename T>
93+
using larger_unsigned_type_t = typename larger_unsigned_type<T>::type;
94+
95+
template <typename T>
96+
using unsigned_implicit_prom_t = typename ::cuda::std::make_unsigned<implicit_prom_t<T>>::type;
97+
98+
template <typename T>
99+
using supported_integral = ::cuda::std::bool_constant<
100+
::cuda::std::is_integral<T>::value && !::cuda::std::is_same<T, bool>::value && (sizeof(T) <= 8)>;
101+
102+
/***********************************************************************************************************************
103+
* Extract higher bits after multiplication
104+
**********************************************************************************************************************/
105+
106+
template <typename DivisorType, typename T, typename R>
107+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_implicit_prom_t<DivisorType>
108+
multiply_extract_higher_bits(T value, R multiplier)
109+
{
110+
static_assert(supported_integral<T>::value, "unsupported type");
111+
static_assert(supported_integral<R>::value, "unsupported type");
112+
_CCCL_ASSERT(value >= 0, "value must be non-negative");
113+
_CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative");
114+
static constexpr int NumBits = sizeof(DivisorType) * CHAR_BIT;
115+
using unsigned_t = unsigned_implicit_prom_t<DivisorType>;
116+
using larger_t = larger_unsigned_type_t<DivisorType>;
117+
// clang-format off
118+
NV_IF_TARGET(
119+
NV_IS_HOST,
120+
(return static_cast<unsigned_t>((static_cast<larger_t>(value) * multiplier) >> NumBits);),
121+
//NV_IS_DEVICE
122+
(return (sizeof(T) == 8)
123+
? static_cast<unsigned_t>(__umul64hi(value, multiplier))
124+
: static_cast<unsigned_t>((static_cast<larger_t>(value) * multiplier) >> NumBits);));
125+
// clang-format on
126+
}
127+
128+
/***********************************************************************************************************************
129+
* Fast Modulo/Division based on Precomputation
130+
**********************************************************************************************************************/
131+
132+
_CCCL_DIAG_PUSH
133+
_CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */
134+
135+
template <typename T1>
136+
class fast_div_mod
137+
{
138+
static_assert(supported_integral<T1>::value, "unsupported type");
139+
140+
// uint16_t is a special case that would requires complex logic. Workaround: convert to int
141+
using T = ::cuda::std::conditional_t<::cuda::std::is_same<T1, ::cuda::std::uint16_t>::value, int, T1>;
142+
using unsigned_t = unsigned_implicit_prom_t<T>;
143+
144+
public:
145+
template <typename R>
146+
struct result
147+
{
148+
using common_t = decltype(R{} / T{});
149+
common_t quotient;
150+
common_t remainder;
151+
};
152+
153+
fast_div_mod() = delete;
154+
155+
_CCCL_NODISCARD _CCCL_HOST_DEVICE explicit fast_div_mod(T divisor) noexcept
156+
: _divisor{static_cast<unsigned_t>(divisor)}
157+
{
158+
using larger_t = larger_unsigned_type_t<T>;
159+
_CCCL_ASSERT(divisor > 0, "divisor must be positive");
160+
auto udivisor = static_cast<unsigned_t>(divisor);
161+
// the following branches are needed to avoid negative shift
162+
if (::cuda::std::has_single_bit(udivisor)) // power of two
163+
{
164+
_shift_right = ::cuda::std::bit_width(udivisor) - 1;
165+
return;
166+
}
167+
else if (sizeof(T) == 8 && divisor == 3)
168+
{
169+
return;
170+
}
171+
constexpr int BitSize = sizeof(T) * CHAR_BIT; // 32
172+
constexpr int BitOffset = BitSize / 16; // 2
173+
int num_bits = ::cuda::std::bit_width(udivisor) + 1;
174+
_CCCL_ASSERT(static_cast<size_t>(num_bits + BitSize - BitOffset) < sizeof(larger_t) * CHAR_BIT, "overflow error");
175+
// without explicit power-of-two check, num_bits needs to replace +1 with !::cuda::std::has_single_bit(udivisor)
176+
_multiplier = static_cast<unsigned_t>(::cuda::ceil_div(larger_t{1} << (num_bits + BitSize - BitOffset), //
177+
static_cast<larger_t>(divisor)));
178+
_shift_right = num_bits - BitOffset;
179+
_CCCL_ASSERT(_multiplier != 0, "overflow error");
180+
}
181+
182+
fast_div_mod(const fast_div_mod&) noexcept = default;
183+
184+
fast_div_mod(fast_div_mod&&) noexcept = default;
185+
186+
template <typename R>
187+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE result<R> operator()(R dividend) const noexcept
188+
{
189+
static_assert(supported_integral<R>::value, "unsupported type");
190+
using common_t = decltype(R{} / T{});
191+
using ucommon_t = ::cuda::std::make_unsigned_t<common_t>;
192+
using result_t = result<R>;
193+
_CCCL_ASSERT(dividend >= 0, "divisor must be non-negative");
194+
auto udividend = static_cast<ucommon_t>(dividend);
195+
if (_divisor == 1)
196+
{
197+
return result_t{static_cast<common_t>(dividend), common_t{}};
198+
}
199+
else if (_divisor > unsigned_t{::cuda::std::numeric_limits<T>::max() / 2})
200+
{
201+
auto quotient = udividend >= static_cast<ucommon_t>(_divisor);
202+
return result_t{static_cast<common_t>(quotient), static_cast<common_t>(udividend - (quotient * _divisor))};
203+
}
204+
else if (sizeof(T) == 8 && _divisor == 3)
205+
{
206+
return result_t{static_cast<common_t>(udividend / 3), static_cast<common_t>(udividend % 3)};
207+
}
208+
auto higher_bits = (_multiplier == 0) ? udividend : multiply_extract_higher_bits<T>(dividend, _multiplier);
209+
auto quotient = higher_bits >> _shift_right;
210+
auto remainder = udividend - (quotient * _divisor);
211+
_CCCL_ASSERT(quotient == udividend / _divisor, "wrong quotient");
212+
_CCCL_ASSERT(remainder < (ucommon_t) _divisor, "remainder out of range");
213+
return result_t{static_cast<common_t>(quotient), static_cast<common_t>(remainder)};
214+
}
215+
216+
template <typename R>
217+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE friend implicit_prom_t<T> operator/(R dividend, fast_div_mod div) noexcept
218+
{
219+
return div(dividend).quotient;
220+
}
221+
222+
template <typename R>
223+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE friend implicit_prom_t<T> operator%(R dividend, fast_div_mod div) noexcept
224+
{
225+
return div(dividend).remainder;
226+
}
227+
228+
private:
229+
unsigned_t _divisor = 1;
230+
unsigned_t _multiplier = 0;
231+
unsigned _shift_right = 0;
232+
};
233+
_CCCL_DIAG_POP
234+
235+
} // namespace detail
236+
237+
CUB_NAMESPACE_END
238+
239+
#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
240+
_CCCL_NV_DIAG_DEFAULT(186)
241+
#endif // CCCL_ENABLE_DEVICE_ASSERTIONS

cub/cub/detail/mdspan_utils.cuh

+122
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/******************************************************************************
2+
* Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are met:
6+
* * Redistributions of source code must retain the above copyright
7+
* notice, this list of conditions and the following disclaimer.
8+
* * Redistributions in binary form must reproduce the above copyright
9+
* notice, this list of conditions and the following disclaimer in the
10+
* documentation and/or other materials provided with the distribution.
11+
* * Neither the name of the NVIDIA CORPORATION nor the
12+
* names of its contributors may be used to endorse or promote products
13+
* derived from this software without specific prior written permission.
14+
*
15+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18+
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
*
26+
******************************************************************************/
27+
28+
#pragma once
29+
30+
#include <cub/config.cuh>
31+
32+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
33+
# pragma GCC system_header
34+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
35+
# pragma clang system_header
36+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
37+
# pragma system_header
38+
#endif // no system header
39+
40+
#if __cccl_lib_mdspan
41+
42+
# include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
43+
44+
# include <cuda/std/array> // std::array
45+
# include <cuda/std/cstddef> // size_t
46+
# include <cuda/std/mdspan>
47+
# include <cuda/std/type_traits> // make_unsigned_t
48+
# include <cuda/std/utility> // ::cuda::std::index_sequence
49+
50+
CUB_NAMESPACE_BEGIN
51+
52+
namespace detail
53+
{
54+
55+
// Compute the submdspan size of a given rank
56+
template <::cuda::std::size_t Rank, typename IndexType, ::cuda::std::size_t Extent0, ::cuda::std::size_t... Extents>
57+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
58+
sub_size(const ::cuda::std::extents<IndexType, Extent0, Extents...>& ext)
59+
{
60+
::cuda::std::make_unsigned_t<IndexType> s = 1;
61+
for (IndexType i = Rank; i < IndexType{1 + sizeof...(Extents)}; i++) // <- pointless comparison with zero-rank extent
62+
{
63+
s *= ext.extent(i);
64+
}
65+
return s;
66+
}
67+
68+
// avoid pointless comparison of unsigned integer with zero (nvcc 11.x doesn't support nv_diag warning suppression)
69+
template <::cuda::std::size_t Rank, typename IndexType>
70+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
71+
sub_size(const ::cuda::std::extents<IndexType>&)
72+
{
73+
return ::cuda::std::make_unsigned_t<IndexType>{1};
74+
}
75+
76+
// TODO: move to cuda::std
77+
template <typename IndexType, ::cuda::std::size_t... Extents>
78+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
79+
size(const ::cuda::std::extents<IndexType, Extents...>& ext)
80+
{
81+
return sub_size<0>(ext);
82+
}
83+
84+
// precompute modulo/division for each submdspan size (by rank)
85+
template <typename IndexType, ::cuda::std::size_t... E, ::cuda::std::size_t... Ranks>
86+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
87+
sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
88+
{
89+
// deduction guides don't work with nvcc 11.x
90+
using fast_mod_div_t = fast_div_mod<IndexType>;
91+
return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(sub_size<Ranks + 1>(ext))...};
92+
}
93+
94+
// precompute modulo/division for each mdspan extent
95+
template <typename IndexType, ::cuda::std::size_t... E, ::cuda::std::size_t... Ranks>
96+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
97+
extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
98+
{
99+
using fast_mod_div_t = fast_div_mod<IndexType>;
100+
return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(ext.extent(Ranks))...};
101+
}
102+
103+
// GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
104+
template <int Rank, typename Extents>
105+
_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_sub_size_static()
106+
{
107+
using index_type = typename Extents::index_type;
108+
for (index_type i = Rank; i < Extents::rank(); i++)
109+
{
110+
if (Extents::static_extent(i) == ::cuda::std::dynamic_extent)
111+
{
112+
return false;
113+
}
114+
}
115+
return true;
116+
}
117+
118+
} // namespace detail
119+
120+
CUB_NAMESPACE_END
121+
122+
#endif // if __cccl_lib_mdspan

0 commit comments

Comments
 (0)