NVIDIA
diff --git a/‎cub/cub/detail/fast_modulo_division.cuh
+241 b/‎cub/cub/detail/fast_modulo_division.cuh
+241
diff --git a/‎cub/cub/detail/mdspan_utils.cuh
+122 b/‎cub/cub/detail/mdspan_utils.cuh
+122
@@ -0,0 +1,241 @@
+/******************************************************************************
+ * Copyright (c) 2011-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/cmath> // cuda::ceil_div
+#include <cuda/std/bit> // std::has_single_bit
+#include <cuda/std/climits> // CHAR_BIT
+#include <cuda/std/cstdint> // uint64_t
+#include <cuda/std/limits> // numeric_limits
+#include <cuda/std/type_traits> // std::is_integral
+
+#include "cub/detail/type_traits.cuh" // implicit_prom_t
+#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED
+
+#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
+_CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+/***********************************************************************************************************************
+ * larger_unsigned_type
+ **********************************************************************************************************************/
+
+// Maps an integer type T to an unsigned integer type that is wider than T (see the specializations below).
+// This primary template is only selected when none of the specializations applies; in that case `type` is `void`.
+template <typename T, typename = void>
+struct larger_unsigned_type
+{
+  static_assert(sizeof(T) >= 8, "64-bit integer are only supported from CUDA >= 11.5");
+  using type = void;
+};
+
+// Types narrower than 4 bytes are widened to a 32-bit unsigned integer
+template <typename T>
+struct larger_unsigned_type<T, ::cuda::std::enable_if_t<(sizeof(T) < 4)>>
+{
+  using type = ::cuda::std::uint32_t;
+};
+
+// 4-byte types are widened to a 64-bit unsigned integer
+template <typename T>
+struct larger_unsigned_type<T, ::cuda::std::enable_if_t<(sizeof(T) == 4)>>
+{
+  using type = ::cuda::std::uint64_t;
+};
+
+#if CUB_IS_INT128_ENABLED
+
+// 8-byte types are widened to a 128-bit unsigned integer (only when CUB_IS_INT128_ENABLED is set)
+template <typename T>
+struct larger_unsigned_type<T, ::cuda::std::enable_if_t<(sizeof(T) == 8)>>
+{
+  using type = __uint128_t;
+};
+
+#endif // CUB_IS_INT128_ENABLED
+
+// Shorthand for the nested `type` of larger_unsigned_type<T>
+template <typename T>
+using larger_unsigned_type_t = typename larger_unsigned_type<T>::type;
+
+// Unsigned counterpart of implicit_prom_t<T>
+template <typename T>
+using unsigned_implicit_prom_t = ::cuda::std::make_unsigned_t<implicit_prom_t<T>>;
+
+// True for integral types other than bool whose size is at most 8 bytes
+template <typename T>
+using supported_integral = ::cuda::std::integral_constant<
+  bool,
+  (sizeof(T) <= 8) && ::cuda::std::is_integral<T>::value && !::cuda::std::is_same<T, bool>::value>;
+
+/***********************************************************************************************************************
+ * Extract higher bits after multiplication
+ **********************************************************************************************************************/
+
+/**
+ * @brief Multiplies `value` by `multiplier` in a wider unsigned type and returns the product shifted right by the
+ *        number of bits of `DivisorType`.
+ *
+ * Both arguments must be non-negative (checked with `_CCCL_ASSERT`).
+ *
+ * On device, the `__umul64hi` intrinsic is used for 8-byte `DivisorType`. The selection is based on `DivisorType`
+ * (not on the type of `value`) so that it always matches the shift amount `NumBits` used by the host code path and
+ * both paths produce the same result.
+ *
+ * @tparam DivisorType Type that determines the shift amount, the wider intermediate type, and the return type
+ * @param value        First factor
+ * @param multiplier   Second factor
+ * @return The bits of `value * multiplier` above the lowest `sizeof(DivisorType) * CHAR_BIT` bits, converted to
+ *         `unsigned_implicit_prom_t<DivisorType>`
+ */
+template <typename DivisorType, typename T, typename R>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned_implicit_prom_t<DivisorType>
+multiply_extract_higher_bits(T value, R multiplier)
+{
+  static_assert(supported_integral<T>::value, "unsupported type");
+  static_assert(supported_integral<R>::value, "unsupported type");
+  _CCCL_ASSERT(value >= 0, "value must be non-negative");
+  _CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative");
+  static constexpr int NumBits = sizeof(DivisorType) * CHAR_BIT;
+  using unsigned_t             = unsigned_implicit_prom_t<DivisorType>;
+  using larger_t               = larger_unsigned_type_t<DivisorType>;
+  // clang-format off
+  NV_IF_TARGET(
+    NV_IS_HOST,
+      (return static_cast<unsigned_t>((static_cast<larger_t>(value) * multiplier) >> NumBits);),
+    //NV_IS_DEVICE
+      (return (sizeof(DivisorType) == 8)
+        ? static_cast<unsigned_t>(__umul64hi(value, multiplier))
+        : static_cast<unsigned_t>((static_cast<larger_t>(value) * multiplier) >> NumBits);));
+  // clang-format on
+}
+
+/***********************************************************************************************************************
+ * Fast Modulo/Division based on Precomputation
+ **********************************************************************************************************************/
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */
+
+/**
+ * @brief Division and modulo by a divisor that is fixed at construction time.
+ *
+ * The constructor precomputes a multiplier and a right-shift amount. `operator()` then derives the quotient from a
+ * multiplication followed by a shift, and the remainder from the quotient. A few cases are handled separately:
+ * divisor equal to 1, power-of-two divisors (shift only), divisors larger than half of the maximum value of the
+ * divisor type (the quotient is 0 or 1), and the divisor 3 with an 8-byte divisor type (plain `/` and `%`).
+ *
+ * Preconditions (checked with `_CCCL_ASSERT`): the divisor is positive and the dividend is non-negative.
+ *
+ * @tparam T1 Divisor type: an integral type other than `bool` with a size of at most 8 bytes
+ */
+template <typename T1>
+class fast_div_mod
+{
+  static_assert(supported_integral<T1>::value, "unsupported type");
+
+  // uint16_t is a special case that would require complex logic. Workaround: convert to int
+  using T          = ::cuda::std::conditional_t<::cuda::std::is_same<T1, ::cuda::std::uint16_t>::value, int, T1>;
+  using unsigned_t = unsigned_implicit_prom_t<T>;
+
+public:
+  // Quotient and remainder of a division. Both have the type of the built-in expression `R{} / T{}`
+  template <typename R>
+  struct result
+  {
+    using common_t = decltype(R{} / T{});
+    common_t quotient;
+    common_t remainder;
+  };
+
+  fast_div_mod() = delete;
+
+  // Stores `divisor` (which must be positive) and precomputes the multiplier and the shift used by operator()
+  _CCCL_NODISCARD _CCCL_HOST_DEVICE explicit fast_div_mod(T divisor) noexcept
+      : _divisor{static_cast<unsigned_t>(divisor)}
+  {
+    using larger_t = larger_unsigned_type_t<T>;
+    _CCCL_ASSERT(divisor > 0, "divisor must be positive");
+    auto udivisor = static_cast<unsigned_t>(divisor);
+    // the following branches are needed to avoid negative shift
+    if (::cuda::std::has_single_bit(udivisor)) // power of two
+    {
+      // only a shift is needed: _multiplier keeps its default value 0
+      _shift_right = ::cuda::std::bit_width(udivisor) - 1;
+      return;
+    }
+    else if (sizeof(T) == 8 && divisor == 3)
+    {
+      // nothing is precomputed: operator() uses the built-in '/' and '%' for this case
+      return;
+    }
+    constexpr int BitSize   = sizeof(T) * CHAR_BIT; // e.g. 32 for a 4-byte T
+    constexpr int BitOffset = BitSize / 16; // e.g. 2 for a 4-byte T
+    int num_bits            = ::cuda::std::bit_width(udivisor) + 1;
+    _CCCL_ASSERT(static_cast<size_t>(num_bits + BitSize - BitOffset) < sizeof(larger_t) * CHAR_BIT, "overflow error");
+    // without explicit power-of-two check, num_bits needs to replace +1 with !::cuda::std::has_single_bit(udivisor)
+    // multiplier = ceil(2^(num_bits + BitSize - BitOffset) / divisor)
+    _multiplier  = static_cast<unsigned_t>(::cuda::ceil_div(larger_t{1} << (num_bits + BitSize - BitOffset), //
+                                                           static_cast<larger_t>(divisor)));
+    _shift_right = num_bits - BitOffset;
+    _CCCL_ASSERT(_multiplier != 0, "overflow error");
+  }
+
+  fast_div_mod(const fast_div_mod&) noexcept = default;
+
+  fast_div_mod(fast_div_mod&&) noexcept = default;
+
+  // Returns the quotient and the remainder of `dividend` divided by the stored divisor.
+  // `dividend` must be non-negative
+  template <typename R>
+  _CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE result<R> operator()(R dividend) const noexcept
+  {
+    static_assert(supported_integral<R>::value, "unsupported type");
+    using common_t  = decltype(R{} / T{});
+    using ucommon_t = ::cuda::std::make_unsigned_t<common_t>;
+    using result_t  = result<R>;
+    _CCCL_ASSERT(dividend >= 0, "dividend must be non-negative");
+    auto udividend = static_cast<ucommon_t>(dividend);
+    if (_divisor == 1)
+    {
+      return result_t{static_cast<common_t>(dividend), common_t{}};
+    }
+    else if (_divisor > unsigned_t{::cuda::std::numeric_limits<T>::max() / 2})
+    {
+      // the quotient is computed with a single comparison: it is either 0 or 1
+      auto quotient = udividend >= static_cast<ucommon_t>(_divisor);
+      return result_t{static_cast<common_t>(quotient), static_cast<common_t>(udividend - (quotient * _divisor))};
+    }
+    else if (sizeof(T) == 8 && _divisor == 3)
+    {
+      return result_t{static_cast<common_t>(udividend / 3), static_cast<common_t>(udividend % 3)};
+    }
+    // _multiplier == 0 at this point means power-of-two divisor: the quotient is obtained with the shift alone
+    auto higher_bits = (_multiplier == 0) ? udividend : multiply_extract_higher_bits<T>(dividend, _multiplier);
+    auto quotient    = higher_bits >> _shift_right;
+    auto remainder   = udividend - (quotient * _divisor);
+    _CCCL_ASSERT(quotient == udividend / _divisor, "wrong quotient");
+    _CCCL_ASSERT(remainder < (ucommon_t) _divisor, "remainder out of range");
+    return result_t{static_cast<common_t>(quotient), static_cast<common_t>(remainder)};
+  }
+
+  // Quotient of `dividend` divided by the divisor stored in `div`.
+  // NOTE(review): the return type is implicit_prom_t<T>, while the quotient is computed with the type of
+  // `R{} / T{}` -- confirm that callers never pass an R that is wider than T
+  template <typename R>
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE friend implicit_prom_t<T> operator/(R dividend, fast_div_mod div) noexcept
+  {
+    return div(dividend).quotient;
+  }
+
+  // Remainder of `dividend` divided by the divisor stored in `div`
+  template <typename R>
+  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE friend implicit_prom_t<T> operator%(R dividend, fast_div_mod div) noexcept
+  {
+    return div(dividend).remainder;
+  }
+
+private:
+  unsigned_t _divisor    = 1;
+  unsigned_t _multiplier = 0; // stays 0 for power-of-two divisors and for the 8-byte divisor 3
+  unsigned _shift_right  = 0;
+};
+_CCCL_DIAG_POP
+
+} // namespace detail
+
+CUB_NAMESPACE_END
+
+#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
+_CCCL_NV_DIAG_DEFAULT(186)
+#endif // CCCL_ENABLE_DEVICE_ASSERTIONS
@@ -0,0 +1,122 @@
+/******************************************************************************
+ * Copyright (c) 2011-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if __cccl_lib_mdspan
+
+#  include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
+
+#  include <cuda/std/array> // std::array
+#  include <cuda/std/cstddef> // size_t
+#  include <cuda/std/mdspan>
+#  include <cuda/std/type_traits> // make_unsigned_t
+#  include <cuda/std/utility> // ::cuda::std::index_sequence
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+// Product of the extents of `ext` from dimension `Rank` (inclusive) up to the last dimension
+template <::cuda::std::size_t Rank, typename IndexType, ::cuda::std::size_t Extent0, ::cuda::std::size_t... Extents>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
+sub_size(const ::cuda::std::extents<IndexType, Extent0, Extents...>& ext)
+{
+  using unsigned_index_t   = ::cuda::std::make_unsigned_t<IndexType>;
+  unsigned_index_t product = 1;
+  IndexType dim            = Rank;
+  // with a zero-rank extent this comparison would be pointless; that case has its own overload
+  while (dim < IndexType{1 + sizeof...(Extents)})
+  {
+    product *= ext.extent(dim);
+    dim++;
+  }
+  return product;
+}
+
+// Overload for extents without dimensions: there is nothing to multiply, the size is always 1.
+// It avoids a pointless comparison of unsigned integer with zero (nvcc 11.x doesn't support nv_diag warning
+// suppression)
+template <::cuda::std::size_t Rank, typename IndexType>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
+sub_size(const ::cuda::std::extents<IndexType>&)
+{
+  using unsigned_index_t = ::cuda::std::make_unsigned_t<IndexType>;
+  return unsigned_index_t{1};
+}
+
+// TODO: move to cuda::std
+// Product of all the extents of `ext`, namely the sub-size that starts at dimension 0
+template <typename IndexType, ::cuda::std::size_t... Extents>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
+size(const ::cuda::std::extents<IndexType, Extents...>& ext)
+{
+  constexpr ::cuda::std::size_t first_rank = 0;
+  return sub_size<first_rank>(ext);
+}
+
+// Builds one fast_div_mod object per entry of `Ranks`; the divisor for entry R is sub_size<R + 1>(ext)
+template <typename IndexType, ::cuda::std::size_t... E, ::cuda::std::size_t... Ranks>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
+sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
+{
+  // the template arguments are spelled out because deduction guides don't work with nvcc 11.x
+  using div_mod_t       = fast_div_mod<IndexType>;
+  using div_mod_array_t = ::cuda::std::array<div_mod_t, sizeof...(Ranks)>;
+  return div_mod_array_t{div_mod_t(sub_size<Ranks + 1>(ext))...};
+}
+
+// Builds one fast_div_mod object per entry of `Ranks`; the divisor for entry R is ext.extent(R)
+template <typename IndexType, ::cuda::std::size_t... E, ::cuda::std::size_t... Ranks>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
+extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
+{
+  using div_mod_t       = fast_div_mod<IndexType>;
+  using div_mod_array_t = ::cuda::std::array<div_mod_t, sizeof...(Ranks)>;
+  return div_mod_array_t{div_mod_t(ext.extent(Ranks))...};
+}
+
+// Returns true when no dimension of `Extents` from `Rank` (inclusive) onwards is a dynamic extent.
+// GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
+template <int Rank, typename Extents>
+_CCCL_NODISCARD _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_sub_size_static()
+{
+  using index_type = typename Extents::index_type;
+  bool all_static  = true;
+  // stop at the first dynamic extent
+  for (index_type dim = Rank; all_static && dim < Extents::rank(); dim++)
+  {
+    all_static = !(Extents::static_extent(dim) == ::cuda::std::dynamic_extent);
+  }
+  return all_static;
+}
+
+} // namespace detail
+
+CUB_NAMESPACE_END
+
+#endif // if __cccl_lib_mdspan