diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh
index ed703e26c9d..4154123973a 100644
--- a/cub/cub/detail/choose_offset.cuh
+++ b/cub/cub/detail/choose_offset.cuh
@@ -29,8 +29,10 @@
 
 #include <cub/config.cuh>
 
+#include <cuda/std/iterator>
+#include <cuda/std/type_traits>
+
 #include <cstdint>
-#include <type_traits>
 
 CUB_NAMESPACE_BEGIN
 
@@ -45,18 +47,28 @@ template <typename NumItemsT>
 struct ChooseOffsetT
 {
   // NumItemsT must be an integral type (but not bool).
-  static_assert(
-    std::is_integral<NumItemsT>::value &&
-      !std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value,
-    "NumItemsT must be an integral type, but not bool");
+  static_assert(std::is_integral<NumItemsT>::value &&
+                  !std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value,
+                "NumItemsT must be an integral type, but not bool");
 
   // Unsigned integer type for global offsets.
-  using Type = typename std::conditional<sizeof(NumItemsT) <= 4,
-                                         std::uint32_t,
-                                         unsigned long long>::type;
+  using Type =
+    typename std::conditional<sizeof(NumItemsT) <= 4, std::uint32_t, unsigned long long>::type;
 };
 
+/**
+ * common_iterator_value sets member type to the common_type of
+ * value_type for all argument types. used to get OffsetT in
+ * DeviceSegmentedReduce.
+ */
+template <typename... Iter>
+struct common_iterator_value
+{
+  using type = ::cuda::std::__common_type_t<::cuda::std::__iter_value_type<Iter>...>;
+};
+template <typename... Iter>
+using common_iterator_value_t = typename common_iterator_value<Iter...>::type;
+
 } // namespace detail
 
 CUB_NAMESPACE_END
-
diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh
index 52706f2b133..a776ac0f9e7 100644
--- a/cub/cub/device/device_segmented_reduce.cuh
+++ b/cub/cub/device/device_segmented_reduce.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,34 +27,34 @@
  ******************************************************************************/
 
 /**
- * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations 
- *       for computing a batched reduction across multiple sequences of data 
+ * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations
+ *       for computing a batched reduction across multiple sequences of data
  *       items residing within device-accessible memory.
  */
 
 #pragma once
 
-#include <iterator>
-
 #include <cub/config.cuh>
+#include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_reduce.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cub/util_deprecated.cuh>
 #include <cub/util_type.cuh>
 
-CUB_NAMESPACE_BEGIN
+#include <iterator>
 
+CUB_NAMESPACE_BEGIN
 
 /**
- * @brief DeviceSegmentedReduce provides device-wide, parallel operations for 
- *        computing a reduction across multiple sequences of data items 
+ * @brief DeviceSegmentedReduce provides device-wide, parallel operations for
+ *        computing a reduction across multiple sequences of data items
  *        residing within device-accessible memory. ![](reduce_logo.png)
  * @ingroup SegmentedModule
  *
  * @par Overview
- * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)">*reduction*</a> 
- * (or *fold*) uses a binary combining operator to compute a single aggregate 
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)">*reduction*</a>
+ * (or *fold*) uses a binary combining operator to compute a single aggregate
  * from a sequence of input elements.
  *
  * @par Usage Considerations
@@ -64,7 +64,7 @@ CUB_NAMESPACE_BEGIN
 struct DeviceSegmentedReduce
 {
   /**
-   * @brief Computes a device-wide segmented reduction using the specified 
+   * @brief Computes a device-wide segmented reduction using the specified
    *        binary `reduction_op` functor.
    *
    * @par
@@ -78,19 +78,19 @@ struct DeviceSegmentedReduce
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
-   * - Let `s` be in `[0, num_segments)`. The range 
-   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not 
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
    *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates a custom min-reduction of a device 
+   * The code snippet below illustrates a custom min-reduction of a device
    * vector of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
    * // CustomMin functor
@@ -103,7 +103,7 @@ struct DeviceSegmentedReduce
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int          num_segments;   // e.g., 3
    * int          *d_offsets;     // e.g., [0, 3, 3, 7]
@@ -131,116 +131,114 @@ struct DeviceSegmentedReduce
    * // d_out <-- [6, INT_MAX, 0]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Output iterator type for recording the reduced 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced
    *   aggregate \iterator
    *
-   * @tparam BeginOffsetIteratorT 
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @tparam ReductionOp          
-   *   **[inferred]** Binary reduction functor type having member 
+   * @tparam ReductionOpT
+   *   **[inferred]** Binary reduction functor type having member
    *   `T operator()(const T &a, const T &b)`
    *
-   * @tparam T                    
-   *   **[inferred]** Data element type that is convertible to the `value` type 
+   * @tparam T
+   *   **[inferred]** Data element type that is convertible to the `value` type
    *   of `InputIteratorT`
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of \p d_temp_storage allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
-   * @param[in] reduction_op  
-   *   Binary reduction functor 
+   * @param[in] reduction_op
+   *   Binary reduction functor
    *
-   * @param[in] initial_value  
+   * @param[in] initial_value
    *   Initial value of the reduction for each segment
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT,
-            typename ReductionOp,
+            typename ReductionOpT,
             typename T>
-  CUB_RUNTIME_FUNCTION static cudaError_t
-  Reduce(void *d_temp_storage,
-         size_t &temp_storage_bytes,
-         InputIteratorT d_in,
-         OutputIteratorT d_out,
-         int num_segments,
-         BeginOffsetIteratorT d_begin_offsets,
-         EndOffsetIteratorT d_end_offsets,
-         ReductionOp reduction_op,
-         T initial_value,
-         cudaStream_t stream = 0)
+  CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
+                                                 size_t &temp_storage_bytes,
+                                                 InputIteratorT d_in,
+                                                 OutputIteratorT d_out,
+                                                 int num_segments,
+                                                 BeginOffsetIteratorT d_begin_offsets,
+                                                 EndOffsetIteratorT d_end_offsets,
+                                                 ReductionOpT reduction_op,
+                                                 T initial_value,
+                                                 cudaStream_t stream = 0)
   {
-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Integer type for global offsets
+    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
 
     return DispatchSegmentedReduce<InputIteratorT,
                                    OutputIteratorT,
                                    BeginOffsetIteratorT,
                                    EndOffsetIteratorT,
                                    OffsetT,
-                                   ReductionOp>::Dispatch(d_temp_storage,
-                                                          temp_storage_bytes,
-                                                          d_in,
-                                                          d_out,
-                                                          num_segments,
-                                                          d_begin_offsets,
-                                                          d_end_offsets,
-                                                          reduction_op,
-                                                          initial_value,
-                                                          stream);
+                                   ReductionOpT>::Dispatch(d_temp_storage,
+                                                           temp_storage_bytes,
+                                                           d_in,
+                                                           d_out,
+                                                           num_segments,
+                                                           d_begin_offsets,
+                                                           d_end_offsets,
+                                                           reduction_op,
+                                                           initial_value,
+                                                           stream);
   }
 
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT,
-            typename ReductionOp,
+            typename ReductionOpT,
             typename T>
-  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION static cudaError_t
+  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t
   Reduce(void *d_temp_storage,
          size_t &temp_storage_bytes,
          InputIteratorT d_in,
@@ -248,7 +246,7 @@ struct DeviceSegmentedReduce
          int num_segments,
          BeginOffsetIteratorT d_begin_offsets,
          EndOffsetIteratorT d_end_offsets,
-         ReductionOp reduction_op,
+         ReductionOpT reduction_op,
          T initial_value,
          cudaStream_t stream,
          bool debug_synchronous)
@@ -259,7 +257,7 @@ struct DeviceSegmentedReduce
                   OutputIteratorT,
                   BeginOffsetIteratorT,
                   EndOffsetIteratorT,
-                  ReductionOp,
+                  ReductionOpT,
                   T>(d_temp_storage,
                      temp_storage_bytes,
                      d_in,
@@ -273,7 +271,7 @@ struct DeviceSegmentedReduce
   }
 
   /**
-   * @brief Computes a device-wide segmented sum using the addition (`+`) 
+   * @brief Computes a device-wide segmented sum using the addition (`+`)
    *        operator.
    *
    * @par
@@ -283,22 +281,22 @@ struct DeviceSegmentedReduce
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `+` operators that are non-commutative.
-   * - Let `s` be in `[0, num_segments)`. The range 
-   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not 
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
    *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the sum reduction of a device vector of 
+   * The code snippet below illustrates the sum reduction of a device vector of
    * `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int num_segments;   // e.g., 3
    * int *d_offsets;     // e.g., [0, 3, 3, 7]
@@ -324,102 +322,98 @@ struct DeviceSegmentedReduce
    * // d_out <-- [21, 0, 17]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Output iterator type for recording the reduced aggregate 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced aggregate
    *   \iterator
    *
-   * @tparam BeginOffsetIteratorT 
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
    * @param[in] stream
-   *   **[optional]</b> CUDA stream to launch kernels within.  
+   *   **[optional]</b> CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_RUNTIME_FUNCTION static cudaError_t
-  Sum(void *d_temp_storage,
-      size_t &temp_storage_bytes,
-      InputIteratorT d_in,
-      OutputIteratorT d_out,
-      int num_segments,
-      BeginOffsetIteratorT d_begin_offsets,
-      EndOffsetIteratorT d_end_offsets,
-      cudaStream_t stream = 0)
+  CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
+                                              size_t &temp_storage_bytes,
+                                              InputIteratorT d_in,
+                                              OutputIteratorT d_out,
+                                              int num_segments,
+                                              BeginOffsetIteratorT d_begin_offsets,
+                                              EndOffsetIteratorT d_end_offsets,
+                                              cudaStream_t stream = 0)
   {
-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Integer type for global offsets
+    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
 
     // The output value type
     using OutputT =
-      cub::detail::non_void_value_t<OutputIteratorT,
-                                    cub::detail::value_t<InputIteratorT>>;
+      cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>;
 
-    return DispatchSegmentedReduce<
-      InputIteratorT,
-      OutputIteratorT,
-      BeginOffsetIteratorT,
-      EndOffsetIteratorT,
-      OffsetT,
-      cub::Sum>::Dispatch(d_temp_storage,
-                          temp_storage_bytes,
-                          d_in,
-                          d_out,
-                          num_segments,
-                          d_begin_offsets,
-                          d_end_offsets,
-                          cub::Sum(),
-                          OutputT(), // zero-initialize
-                          stream);
+    return DispatchSegmentedReduce<InputIteratorT,
+                                   OutputIteratorT,
+                                   BeginOffsetIteratorT,
+                                   EndOffsetIteratorT,
+                                   OffsetT,
+                                   cub::Sum>::Dispatch(d_temp_storage,
+                                                       temp_storage_bytes,
+                                                       d_in,
+                                                       d_out,
+                                                       num_segments,
+                                                       d_begin_offsets,
+                                                       d_end_offsets,
+                                                       cub::Sum(),
+                                                       OutputT(), // zero-initialize
+                                                       stream);
   }
 
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION static cudaError_t
+  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t
   Sum(void *d_temp_storage,
       size_t &temp_storage_bytes,
       InputIteratorT d_in,
@@ -432,47 +426,45 @@ struct DeviceSegmentedReduce
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return Sum<InputIteratorT,
-               OutputIteratorT,
-               BeginOffsetIteratorT,
-               EndOffsetIteratorT>(d_temp_storage,
-                                   temp_storage_bytes,
-                                   d_in,
-                                   d_out,
-                                   num_segments,
-                                   d_begin_offsets,
-                                   d_end_offsets,
-                                   stream);
+    return Sum<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_segments,
+      d_begin_offsets,
+      d_end_offsets,
+      stream);
   }
 
   /**
-   * @brief Computes a device-wide segmented minimum using the less-than 
+   * @brief Computes a device-wide segmented minimum using the less-than
    *        (`<`) operator.
    *
    * @par
-   * - Uses `std::numeric_limits<T>::max()` as the initial value of the 
+   * - Uses `std::numeric_limits<T>::max()` as the initial value of the
    *   reduction for each segment.
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
    *   specified as `segment_offsets + 1`).
    * - Does not support `<` operators that are non-commutative.
-   * - Let `s` be in `[0, num_segments)`. The range 
-   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not 
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
    *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the min-reduction of a device vector of 
+   * The code snippet below illustrates the min-reduction of a device vector of
    * `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int num_segments;   // e.g., 3
    * int *d_offsets;     // e.g., [0, 3, 3, 7]
@@ -498,72 +490,71 @@ struct DeviceSegmentedReduce
    * // d_out <-- [6, INT_MAX, 0]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Output iterator type for recording the reduced 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced
    *   aggregate \iterator
    *
-   * @tparam BeginOffsetIteratorT 
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_RUNTIME_FUNCTION static cudaError_t
-  Min(void *d_temp_storage,
-      size_t &temp_storage_bytes,
-      InputIteratorT d_in,
-      OutputIteratorT d_out,
-      int num_segments,
-      BeginOffsetIteratorT d_begin_offsets,
-      EndOffsetIteratorT d_end_offsets,
-      cudaStream_t stream = 0)
+  CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
+                                              size_t &temp_storage_bytes,
+                                              InputIteratorT d_in,
+                                              OutputIteratorT d_out,
+                                              int num_segments,
+                                              BeginOffsetIteratorT d_begin_offsets,
+                                              EndOffsetIteratorT d_end_offsets,
+                                              cudaStream_t stream = 0)
   {
-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Integer type for global offsets
+    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
 
     // The input value type
     using InputT = cub::detail::value_t<InputIteratorT>;
@@ -593,8 +584,7 @@ struct DeviceSegmentedReduce
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION static cudaError_t
+  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t
   Min(void *d_temp_storage,
       size_t &temp_storage_bytes,
       InputIteratorT d_in,
@@ -607,53 +597,51 @@ struct DeviceSegmentedReduce
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return Min<InputIteratorT,
-               OutputIteratorT,
-               BeginOffsetIteratorT,
-               EndOffsetIteratorT>(d_temp_storage,
-                                   temp_storage_bytes,
-                                   d_in,
-                                   d_out,
-                                   num_segments,
-                                   d_begin_offsets,
-                                   d_end_offsets,
-                                   stream);
+    return Min<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_segments,
+      d_begin_offsets,
+      d_end_offsets,
+      stream);
   }
 
   /**
-   * @brief Finds the first device-wide minimum in each segment using the 
-   *        less-than ('<') operator, also returning the in-segment index of 
+   * @brief Finds the first device-wide minimum in each segment using the
+   *        less-than ('<') operator, also returning the in-segment index of
    *        that item.
    *
    * @par
-   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>` 
+   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
    *   (assuming the value type of `d_in` is `T`)
-   *   - The minimum of the *i*<sup>th</sup> segment is written to 
-   *     `d_out[i].value` and its offset in that segment is written to 
+   *   - The minimum of the *i*<sup>th</sup> segment is written to
+   *     `d_out[i].value` and its offset in that segment is written to
    *     `d_out[i].key`.
-   *   - The `{1, std::numeric_limits<T>::max()}` tuple is produced for 
+   *   - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
    *     zero-length inputs
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
    *   is specified as `segment_offsets + 1`).
    * - Does not support `<` operators that are non-commutative.
-   * - Let `s` be in `[0, num_segments)`. The range 
-   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not 
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
    *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the argmin-reduction of a device vector 
+   * The code snippet below illustrates the argmin-reduction of a device vector
    * of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int                      num_segments;   // e.g., 3
    * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
@@ -679,80 +667,80 @@ struct DeviceSegmentedReduce
    * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input items 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input items
    *   (of some type `T`) \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Output iterator type for recording the reduced aggregate 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced aggregate
    *   (having value type `KeyValuePair<int, T>`) \iterator
    *
    * @tparam BeginOffsetIteratorT
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the
    *   *i*<sup>th</sup> is considered empty.
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_RUNTIME_FUNCTION static cudaError_t
-  ArgMin(void *d_temp_storage,
-         size_t &temp_storage_bytes,
-         InputIteratorT d_in,
-         OutputIteratorT d_out,
-         int num_segments,
-         BeginOffsetIteratorT d_begin_offsets,
-         EndOffsetIteratorT d_end_offsets,
-         cudaStream_t stream = 0)
+  CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
+                                                 size_t &temp_storage_bytes,
+                                                 InputIteratorT d_in,
+                                                 OutputIteratorT d_out,
+                                                 int num_segments,
+                                                 BeginOffsetIteratorT d_begin_offsets,
+                                                 EndOffsetIteratorT d_end_offsets,
+                                                 cudaStream_t stream = 0)
   {
-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Integer type for global offsets
+    // Using common iterator value type is a breaking change, see:
+    // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
+    using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
 
     // The input type
     using InputValueT = cub::detail::value_t<InputIteratorT>;
 
     // The output tuple type
     using OutputTupleT =
-      cub::detail::non_void_value_t<OutputIteratorT,
-                                    KeyValuePair<OffsetT, InputValueT>>;
+      cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
 
     // The output value type
     using OutputValueT = typename OutputTupleT::Value;
@@ -762,14 +750,13 @@ struct DeviceSegmentedReduce
     using InitT = detail::reduce::empty_problem_init_t<AccumT>;
 
     // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-    using ArgIndexInputIteratorT =
-      ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+    using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
 
     ArgIndexInputIteratorT d_indexed_in(d_in);
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Max())}; 
+    InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
 
     return DispatchSegmentedReduce<ArgIndexInputIteratorT,
                                    OutputIteratorT,
@@ -794,8 +781,7 @@ struct DeviceSegmentedReduce
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION static cudaError_t
+  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t
   ArgMin(void *d_temp_storage,
          size_t &temp_storage_bytes,
          InputIteratorT d_in,
@@ -808,47 +794,45 @@ struct DeviceSegmentedReduce
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return ArgMin<InputIteratorT,
-                  OutputIteratorT,
-                  BeginOffsetIteratorT,
-                  EndOffsetIteratorT>(d_temp_storage,
-                                      temp_storage_bytes,
-                                      d_in,
-                                      d_out,
-                                      num_segments,
-                                      d_begin_offsets,
-                                      d_end_offsets,
-                                      stream);
+    return ArgMin<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_segments,
+      d_begin_offsets,
+      d_end_offsets,
+      stream);
   }
 
   /**
-   * @brief Computes a device-wide segmented maximum using the greater-than 
+   * @brief Computes a device-wide segmented maximum using the greater-than
    *        (`>`) operator.
    *
    * @par
-   * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the 
+   * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
    *   reduction.
    * - When input a contiguous sequence of segments, a single sequence
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `>` operators that are non-commutative.
-   * - Let `s` be in `[0, num_segments)`. The range 
-   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not 
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
    *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the max-reduction of a device vector 
+   * The code snippet below illustrates the max-reduction of a device vector
    * of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int num_segments;   // e.g., 3
    * int *d_offsets;     // e.g., [0, 3, 3, 7]
@@ -874,72 +858,71 @@ struct DeviceSegmentedReduce
    * // d_out <-- [8, INT_MIN, 9]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Output iterator type for recording the reduced 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced
    *   aggregate \iterator
    *
-   * @tparam BeginOffsetIteratorT 
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT 
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_RUNTIME_FUNCTION static cudaError_t
-  Max(void *d_temp_storage,
-      size_t &temp_storage_bytes,
-      InputIteratorT d_in,
-      OutputIteratorT d_out,
-      int num_segments,
-      BeginOffsetIteratorT d_begin_offsets,
-      EndOffsetIteratorT d_end_offsets,
-      cudaStream_t stream = 0)
+  CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
+                                              size_t &temp_storage_bytes,
+                                              InputIteratorT d_in,
+                                              OutputIteratorT d_out,
+                                              int num_segments,
+                                              BeginOffsetIteratorT d_begin_offsets,
+                                              EndOffsetIteratorT d_end_offsets,
+                                              cudaStream_t stream = 0)
   {
-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Integer type for global offsets
+    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
 
     // The input value type
     using InputT = cub::detail::value_t<InputIteratorT>;
@@ -969,8 +952,7 @@ struct DeviceSegmentedReduce
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION static cudaError_t
+  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t
   Max(void *d_temp_storage,
       size_t &temp_storage_bytes,
       InputIteratorT d_in,
@@ -983,53 +965,51 @@ struct DeviceSegmentedReduce
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return Max<InputIteratorT,
-               OutputIteratorT,
-               BeginOffsetIteratorT,
-               EndOffsetIteratorT>(d_temp_storage,
-                                   temp_storage_bytes,
-                                   d_in,
-                                   d_out,
-                                   num_segments,
-                                   d_begin_offsets,
-                                   d_end_offsets,
-                                   stream);
+    return Max<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_segments,
+      d_begin_offsets,
+      d_end_offsets,
+      stream);
   }
 
   /**
-   * @brief Finds the first device-wide maximum in each segment using the 
-   *        greater-than ('>') operator, also returning the in-segment index of 
+   * @brief Finds the first device-wide maximum in each segment using the
+   *        greater-than ('>') operator, also returning the in-segment index of
    *        that item
    *
    * @par
-   * - The output value type of `d_out` is `cub::KeyValuePair<int, T>` 
+   * - The output value type of `d_out` is `cub::KeyValuePair<int, T>`
    *   (assuming the value type of `d_in` is `T`)
-   *   - The maximum of the *i*<sup>th</sup> segment is written to 
-   *     `d_out[i].value` and its offset in that segment is written to 
+   *   - The maximum of the *i*<sup>th</sup> segment is written to
+   *     `d_out[i].value` and its offset in that segment is written to
    *     `d_out[i].key`.
-   *   - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for 
+   *   - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
    *     zero-length inputs
    * - When input a contiguous sequence of segments, a single sequence
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `>` operators that are non-commutative.
-   * - Let `s` be in `[0, num_segments)`. The range 
-   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not 
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
    *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the argmax-reduction of a device vector 
+   * The code snippet below illustrates the argmax-reduction of a device vector
    * of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_reduce.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int                      num_segments;   // e.g., 3
    * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
@@ -1055,80 +1035,80 @@ struct DeviceSegmentedReduce
    * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input items 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input items
    *   (of some type `T`) \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Output iterator type for recording the reduced aggregate 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced aggregate
    *   (having value type `KeyValuePair<int, T>`) \iterator
    *
-   * @tparam BeginOffsetIteratorT 
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_RUNTIME_FUNCTION static cudaError_t
-  ArgMax(void *d_temp_storage,
-         size_t &temp_storage_bytes,
-         InputIteratorT d_in,
-         OutputIteratorT d_out,
-         int num_segments,
-         BeginOffsetIteratorT d_begin_offsets,
-         EndOffsetIteratorT d_end_offsets,
-         cudaStream_t stream = 0)
+  CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
+                                                 size_t &temp_storage_bytes,
+                                                 InputIteratorT d_in,
+                                                 OutputIteratorT d_out,
+                                                 int num_segments,
+                                                 BeginOffsetIteratorT d_begin_offsets,
+                                                 EndOffsetIteratorT d_end_offsets,
+                                                 cudaStream_t stream = 0)
   {
-    // Signed integer type for global offsets
-    using OffsetT = int;
+    // Integer type for global offsets
+    // Using common iterator value type is a breaking change, see:
+    // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
+    using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
 
     // The input type
     using InputValueT = cub::detail::value_t<InputIteratorT>;
 
     // The output tuple type
     using OutputTupleT =
-      cub::detail::non_void_value_t<OutputIteratorT,
-                                    KeyValuePair<OffsetT, InputValueT>>;
+      cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
 
     using AccumT = OutputTupleT;
 
@@ -1138,14 +1118,13 @@ struct DeviceSegmentedReduce
     using OutputValueT = typename OutputTupleT::Value;
 
     // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-    using ArgIndexInputIteratorT =
-      ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+    using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
 
     ArgIndexInputIteratorT d_indexed_in(d_in);
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())}; 
+    InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
 
     return DispatchSegmentedReduce<ArgIndexInputIteratorT,
                                    OutputIteratorT,
@@ -1155,23 +1134,22 @@ struct DeviceSegmentedReduce
                                    cub::ArgMax,
                                    InitT,
                                    AccumT>::Dispatch(d_temp_storage,
-                                                          temp_storage_bytes,
-                                                          d_indexed_in,
-                                                          d_out,
-                                                          num_segments,
-                                                          d_begin_offsets,
-                                                          d_end_offsets,
-                                                          cub::ArgMax(),
-                                                          initial_value,
-                                                          stream);
+                                                     temp_storage_bytes,
+                                                     d_indexed_in,
+                                                     d_out,
+                                                     num_segments,
+                                                     d_begin_offsets,
+                                                     d_end_offsets,
+                                                     cub::ArgMax(),
+                                                     initial_value,
+                                                     stream);
   }
 
   template <typename InputIteratorT,
             typename OutputIteratorT,
             typename BeginOffsetIteratorT,
             typename EndOffsetIteratorT>
-  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION static cudaError_t
+  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t
   ArgMax(void *d_temp_storage,
          size_t &temp_storage_bytes,
          InputIteratorT d_in,
@@ -1184,20 +1162,16 @@ struct DeviceSegmentedReduce
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
-    return ArgMax<InputIteratorT,
-                  OutputIteratorT,
-                  BeginOffsetIteratorT,
-                  EndOffsetIteratorT>(d_temp_storage,
-                                      temp_storage_bytes,
-                                      d_in,
-                                      d_out,
-                                      num_segments,
-                                      d_begin_offsets,
-                                      d_end_offsets,
-                                      stream);
+    return ArgMax<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_segments,
+      d_begin_offsets,
+      d_end_offsets,
+      stream);
   }
 };
 
 CUB_NAMESPACE_END
-
-
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index 2dbcdc76fdd..39a777c8e3d 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,16 +27,13 @@
  ******************************************************************************/
 
 /**
- * @file cub::DeviceReduce provides device-wide, parallel operations for 
- *       computing a reduction across a sequence of data items residing within 
+ * @file cub::DeviceReduce provides device-wide, parallel operations for
+ *       computing a reduction across a sequence of data items residing within
  *       device-accessible memory.
  */
 
 #pragma once
 
-#include <stdio.h>
-#include <iterator>
-
 #include <cub/agent/agent_reduce.cuh>
 #include <cub/config.cuh>
 #include <cub/grid/grid_even_share.cuh>
@@ -50,6 +47,10 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <iterator>
+
+#include <stdio.h>
+
 CUB_NAMESPACE_BEGIN
 
 namespace detail
@@ -110,42 +111,42 @@ __host__ __device__ void finalize_and_store_aggregate(OutputIteratorT d_out,
  *****************************************************************************/
 
 /**
- * @brief Reduce region kernel entry point (multi-block). Computes privatized 
+ * @brief Reduce region kernel entry point (multi-block). Computes privatized
  *        reductions, one per thread block.
  *
- * @tparam ChainedPolicyT 
+ * @tparam ChainedPolicyT
  *   Chained tuning policy
  *
  * @tparam InputIteratorT
  *   Random-access input iterator type for reading input items \iterator
- * 
+ *
  * @tparam OffsetT
  *   Signed integer type for global offsets
- * 
+ *
  * @tparam ReductionOpT
- *   Binary reduction functor type having member 
+ *   Binary reduction functor type having member
  *   `auto operator()(const T &a, const U &b)`
- * 
+ *
  * @tparam InitT
  *   Initial value type
- * 
+ *
  * @tparam AccumT
  *   Accumulator type
- * 
- * @param[in] d_in 
+ *
+ * @param[in] d_in
  *   Pointer to the input sequence of data items
- * 
- * @param[out] d_out 
+ *
+ * @param[out] d_out
  *   Pointer to the output aggregate
- * 
- * @param[in] num_items 
+ *
+ * @param[in] num_items
  *   Total number of input data items
- * 
- * @param[in] even_share 
- *   Even-share descriptor for mapping an equal number of tiles onto each 
+ *
+ * @param[in] even_share
+ *   Even-share descriptor for mapping an equal number of tiles onto each
  *   thread block
- * 
- * @param[in] reduction_op 
+ *
+ * @param[in] reduction_op
  *   Binary reduction functor
  */
 template <typename ChainedPolicyT,
@@ -153,28 +154,26 @@ template <typename ChainedPolicyT,
           typename OffsetT,
           typename ReductionOpT,
           typename AccumT>
-__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
-  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceKernel(InputIteratorT d_in,
-                                                       AccumT *d_out,
-                                                       OffsetT num_items,
-                                                       GridEvenShare<OffsetT> even_share,
-                                                       ReductionOpT reduction_op)
+CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+  void DeviceReduceKernel(InputIteratorT d_in,
+                          AccumT *d_out,
+                          OffsetT num_items,
+                          GridEvenShare<OffsetT> even_share,
+                          ReductionOpT reduction_op)
 {
   // Thread block type for reducing input tiles
-  using AgentReduceT =
-    AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
-                InputIteratorT,
-                AccumT*,
-                OffsetT,
-                ReductionOpT,
-                AccumT>;
+  using AgentReduceT = AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+                                   InputIteratorT,
+                                   AccumT *,
+                                   OffsetT,
+                                   ReductionOpT,
+                                   AccumT>;
 
   // Shared memory storage
   __shared__ typename AgentReduceT::TempStorage temp_storage;
 
   // Consume input tiles
-  AccumT block_aggregate =
-    AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
+  AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
 
   // Output result
   if (threadIdx.x == 0)
@@ -184,8 +183,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
 }
 
 /**
- * @brief Reduce a single tile kernel entry point (single-block). Can be used 
- *        to aggregate privatized thread block reductions from a previous 
+ * @brief Reduce a single tile kernel entry point (single-block). Can be used
+ *        to aggregate privatized thread block reductions from a previous
  *        multi-block reduction pass.
  *
  * @tparam ChainedPolicyT
@@ -196,33 +195,33 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
  *
  * @tparam OutputIteratorT
  *   Output iterator type for recording the reduced aggregate \iterator
- * 
+ *
  * @tparam OffsetT
  *   Signed integer type for global offsets
- * 
+ *
  * @tparam ReductionOpT
- *   Binary reduction functor type having member 
+ *   Binary reduction functor type having member
  *   `T operator()(const T &a, const U &b)`
- * 
+ *
  * @tparam InitT
  *   Initial value type
  *
  * @tparam AccumT
- *   Accumulator type 
+ *   Accumulator type
  *
- * @param[in] d_in 
+ * @param[in] d_in
  *   Pointer to the input sequence of data items
  *
- * @param[out] d_out 
+ * @param[out] d_out
  *   Pointer to the output aggregate
  *
- * @param[in] num_items 
+ * @param[in] num_items
  *   Total number of input data items
  *
- * @param[in] reduction_op 
+ * @param[in] reduction_op
  *   Binary reduction functor
  *
- * @param[in] init 
+ * @param[in] init
  *   The initial value of the reduction
  */
 template <typename ChainedPolicyT,
@@ -232,21 +231,20 @@ template <typename ChainedPolicyT,
           typename ReductionOpT,
           typename InitT,
           typename AccumT>
-__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) //
-  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceSingleTileKernel(InputIteratorT d_in,
-                                                                 OutputIteratorT d_out,
-                                                                 OffsetT num_items,
-                                                                 ReductionOpT reduction_op,
-                                                                 InitT init)
+CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+  void DeviceReduceSingleTileKernel(InputIteratorT d_in,
+                                    OutputIteratorT d_out,
+                                    OffsetT num_items,
+                                    ReductionOpT reduction_op,
+                                    InitT init)
 {
   // Thread block type for reducing input tiles
-  using AgentReduceT =
-    AgentReduce<typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
-                InputIteratorT,
-                OutputIteratorT,
-                OffsetT,
-                ReductionOpT,
-                AccumT>;
+  using AgentReduceT = AgentReduce<typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
+                                   InputIteratorT,
+                                   OutputIteratorT,
+                                   OffsetT,
+                                   ReductionOpT,
+                                   AccumT>;
 
   // Shared memory storage
   __shared__ typename AgentReduceT::TempStorage temp_storage;
@@ -263,8 +261,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE
   }
 
   // Consume input tiles
-  AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op)
-                             .ConsumeRange(OffsetT(0), num_items);
+  AccumT block_aggregate =
+    AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(OffsetT(0), num_items);
 
   // Output result
   if (threadIdx.x == 0)
@@ -281,14 +279,11 @@ __device__ __forceinline__ void NormalizeReductionOutput(T & /*val*/,
 {}
 
 /// Normalize input iterator to segment offset (specialized for arg-index)
-template <typename KeyValuePairT,
-          typename OffsetT,
-          typename WrappedIteratorT,
-          typename OutputValueT>
-__device__ __forceinline__ void NormalizeReductionOutput(
-  KeyValuePairT &val,
-  OffsetT base_offset,
-  ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
+template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
+__device__ __forceinline__ void
+NormalizeReductionOutput(KeyValuePairT &val,
+                         OffsetT base_offset,
+                         ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
 {
   val.key -= base_offset;
 }
@@ -297,7 +292,7 @@ __device__ __forceinline__ void NormalizeReductionOutput(
  * Segmented reduction (one block per segment)
  * @tparam ChainedPolicyT
  *   Chained tuning policy
- * 
+ *
  * @tparam InputIteratorT
  *   Random-access input iterator type for reading input items \iterator
  *
@@ -305,48 +300,48 @@ __device__ __forceinline__ void NormalizeReductionOutput(
  *   Output iterator type for recording the reduced aggregate \iterator
  *
  * @tparam BeginOffsetIteratorT
- *   Random-access input iterator type for reading segment beginning offsets 
+ *   Random-access input iterator type for reading segment beginning offsets
  *   \iterator
  *
  * @tparam EndOffsetIteratorT
- *   Random-access input iterator type for reading segment ending offsets 
+ *   Random-access input iterator type for reading segment ending offsets
  *   \iterator
  *
  * @tparam OffsetT
  *   Signed integer type for global offsets
  *
  * @tparam ReductionOpT
- *   Binary reduction functor type having member 
+ *   Binary reduction functor type having member
  *   `T operator()(const T &a, const U &b)`
  *
  * @tparam InitT
  *   Initial value type
  *
- * @param[in] d_in 
+ * @param[in] d_in
  *   Pointer to the input sequence of data items
  *
- * @param[out] d_out 
+ * @param[out] d_out
  *   Pointer to the output aggregate
  *
- * @param[in] d_begin_offsets 
- *   Random-access input iterator to the sequence of beginning offsets of 
- *   length `num_segments`, such that `d_begin_offsets[i]` is the first element 
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first element
  *   of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
  *
- * @param[in] d_end_offsets 
- *   Random-access input iterator to the sequence of ending offsets of length 
- *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
- *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
- *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+ *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
  *   considered empty.
  *
- * @param[in] num_segments 
+ * @param[in] num_segments
  *   The number of segments that comprise the sorting data
  *
- * @param[in] reduction_op 
+ * @param[in] reduction_op
  *   Binary reduction functor
  *
- * @param[in] init 
+ * @param[in] init
  *   The initial value of the reduction
  */
 template <typename ChainedPolicyT,
@@ -358,8 +353,7 @@ template <typename ChainedPolicyT,
           typename ReductionOpT,
           typename InitT,
           typename AccumT>
-__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
-  CUB_DETAIL_KERNEL_ATTRIBUTES
+CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
   void DeviceSegmentedReduceKernel(InputIteratorT d_in,
                                    OutputIteratorT d_out,
                                    BeginOffsetIteratorT d_begin_offsets,
@@ -369,13 +363,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
                                    InitT init)
 {
   // Thread block type for reducing input tiles
-  using AgentReduceT =
-    AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
-                InputIteratorT,
-                OutputIteratorT,
-                OffsetT,
-                ReductionOpT,
-                AccumT>;
+  using AgentReduceT = AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+                                   InputIteratorT,
+                                   OutputIteratorT,
+                                   OffsetT,
+                                   ReductionOpT,
+                                   AccumT>;
 
   // Shared memory storage
   __shared__ typename AgentReduceT::TempStorage temp_storage;
@@ -394,8 +387,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
   }
 
   // Consume input tiles
-  AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op)
-                             .ConsumeRange(segment_begin, segment_end);
+  AccumT block_aggregate =
+    AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end);
 
   // Normalize as needed
   NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
@@ -421,13 +414,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
  *   Signed integer type for global offsets
  *
  * ReductionOpT
- *   Binary reduction functor type having member 
+ *   Binary reduction functor type having member
  *   `auto operator()(const T &a, const U &b)`
  */
-template <
-    typename AccumT,            
-    typename OffsetT,          
-    typename ReductionOpT>    
+template <typename AccumT, typename OffsetT, typename ReductionOpT>
 struct DeviceReducePolicy
 {
   //---------------------------------------------------------------------------
@@ -504,14 +494,12 @@ struct DeviceReducePolicy
   using MaxPolicy = Policy600;
 };
 
-
-
 /******************************************************************************
  * Single-problem dispatch
  *****************************************************************************/
 
 /**
- * @brief Utility class for dispatching the appropriately-tuned kernels for 
+ * @brief Utility class for dispatching the appropriately-tuned kernels for
  *        device-wide reduction
  *
  * @tparam InputIteratorT
@@ -524,27 +512,21 @@ struct DeviceReducePolicy
  *   Signed integer type for global offsets
  *
  * @tparam ReductionOpT
- *   Binary reduction functor type having member 
+ *   Binary reduction functor type having member
  *   `auto operator()(const T &a, const U &b)`
  *
- * @tparam InitT 
+ * @tparam InitT
  *   Initial value type
  */
-template <
-  typename InputIteratorT,
-  typename OutputIteratorT,
-  typename OffsetT,
-  typename ReductionOpT,
-  typename InitT = 
-    cub::detail::non_void_value_t<
-      OutputIteratorT, 
-      cub::detail::value_t<InputIteratorT>>,
-  typename AccumT = 
-    detail::accumulator_t<
-      ReductionOpT, 
-      InitT, 
-      cub::detail::value_t<InputIteratorT>>,
-  typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename OffsetT,
+          typename ReductionOpT,
+          typename InitT =
+            cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
+          typename AccumT =
+            detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
+          typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
 struct DispatchReduce : SelectedPolicy
 {
   //---------------------------------------------------------------------------
@@ -602,29 +584,28 @@ struct DispatchReduce : SelectedPolicy
       , init(init)
       , stream(stream)
       , ptx_version(ptx_version)
-  {} 
+  {}
 
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION __forceinline__
-  DispatchReduce(void* d_temp_storage,
-                 size_t &temp_storage_bytes,
-                 InputIteratorT d_in,
-                 OutputIteratorT d_out,
-                 OffsetT num_items,
-                 ReductionOpT reduction_op,
-                 InitT init,
-                 cudaStream_t stream,
-                 bool debug_synchronous,
-                 int ptx_version)
-    : d_temp_storage(d_temp_storage)
-    , temp_storage_bytes(temp_storage_bytes)
-    , d_in(d_in)
-    , d_out(d_out)
-    , num_items(num_items)
-    , reduction_op(reduction_op)
-    , init(init)
-    , stream(stream)
-    , ptx_version(ptx_version)
+  CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void *d_temp_storage,
+                                                      size_t &temp_storage_bytes,
+                                                      InputIteratorT d_in,
+                                                      OutputIteratorT d_out,
+                                                      OffsetT num_items,
+                                                      ReductionOpT reduction_op,
+                                                      InitT init,
+                                                      cudaStream_t stream,
+                                                      bool debug_synchronous,
+                                                      int ptx_version)
+      : d_temp_storage(d_temp_storage)
+      , temp_storage_bytes(temp_storage_bytes)
+      , d_in(d_in)
+      , d_out(d_out)
+      , num_items(num_items)
+      , reduction_op(reduction_op)
+      , init(init)
+      , stream(stream)
+      , ptx_version(ptx_version)
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
   }
@@ -643,7 +624,7 @@ struct DispatchReduce : SelectedPolicy
    *   Function type of cub::DeviceReduceSingleTileKernel
    *
    * @param[in] single_tile_kernel
-   *   Kernel function pointer to parameterization of 
+   *   Kernel function pointer to parameterization of
    *   cub::DeviceReduceSingleTileKernel
    */
   template <typename ActivePolicyT, typename SingleTileKernelT>
@@ -661,18 +642,21 @@ struct DispatchReduce : SelectedPolicy
         break;
       }
 
-      // Log single_reduce_sweep_kernel configuration
-      #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+// Log single_reduce_sweep_kernel configuration
+#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
       _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
               "%d items per thread\n",
               ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
               (long long)stream,
               ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
-      #endif
+#endif
 
       // Invoke single_reduce_sweep_kernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-        1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
+        1,
+        ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
+        0,
+        stream)
         .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init);
 
       // Check for failure to launch
@@ -711,16 +695,13 @@ struct DispatchReduce : SelectedPolicy
    * @param[in] reduce_kernel
    *   Kernel function pointer to parameterization of cub::DeviceReduceKernel
    *
-   * @param[in] single_tile_kernel 
-   *   Kernel function pointer to parameterization of 
+   * @param[in] single_tile_kernel
+   *   Kernel function pointer to parameterization of
    *   cub::DeviceReduceSingleTileKernel
    */
-  template <typename ActivePolicyT,
-            typename ReduceKernelT,
-            typename SingleTileKernelT>
+  template <typename ActivePolicyT, typename ReduceKernelT, typename SingleTileKernelT>
   CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
-  InvokePasses(ReduceKernelT reduce_kernel,
-               SingleTileKernelT single_tile_kernel)
+  InvokePasses(ReduceKernelT reduce_kernel, SingleTileKernelT single_tile_kernel)
   {
     cudaError error = cudaSuccess;
     do
@@ -786,8 +767,8 @@ struct DispatchReduce : SelectedPolicy
       // Get grid size for device_reduce_sweep_kernel
       int reduce_grid_size = even_share.grid_size;
 
-      // Log device_reduce_sweep_kernel configuration
-      #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+// Log device_reduce_sweep_kernel configuration
+#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
       _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items "
               "per thread, %d SM occupancy\n",
               reduce_grid_size,
@@ -795,7 +776,7 @@ struct DispatchReduce : SelectedPolicy
               (long long)stream,
               ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
               reduce_config.sm_occupancy);
-      #endif
+#endif
 
       // Invoke DeviceReduceKernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
@@ -803,12 +784,7 @@ struct DispatchReduce : SelectedPolicy
         ActivePolicyT::ReducePolicy::BLOCK_THREADS,
         0,
         stream)
-        .doit(reduce_kernel,
-              d_in,
-              d_block_reductions,
-              num_items,
-              even_share,
-              reduction_op);
+        .doit(reduce_kernel, d_in, d_block_reductions, num_items, even_share, reduction_op);
 
       // Check for failure to launch
       error = CubDebug(cudaPeekAtLastError());
@@ -824,14 +800,14 @@ struct DispatchReduce : SelectedPolicy
         break;
       }
 
-      // Log single_reduce_sweep_kernel configuration
-      #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+// Log single_reduce_sweep_kernel configuration
+#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
       _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
               "%d items per thread\n",
               ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
               (long long)stream,
               ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
-      #endif
+#endif
 
       // Invoke DeviceReduceSingleTileKernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
@@ -876,35 +852,33 @@ struct DispatchReduce : SelectedPolicy
     typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
 
     // Force kernel code-generation in all compiler passes
-    if (num_items <= (SingleTilePolicyT::BLOCK_THREADS *
-                      SingleTilePolicyT::ITEMS_PER_THREAD))
+    if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
     {
       // Small, single tile size
-      return InvokeSingleTile<ActivePolicyT>(
-        DeviceReduceSingleTileKernel<MaxPolicyT,
-                                     InputIteratorT,
-                                     OutputIteratorT,
-                                     OffsetT,
-                                     ReductionOpT,
-                                     InitT,
-                                     AccumT>);
+      return InvokeSingleTile<ActivePolicyT>(DeviceReduceSingleTileKernel<MaxPolicyT,
+                                                                          InputIteratorT,
+                                                                          OutputIteratorT,
+                                                                          OffsetT,
+                                                                          ReductionOpT,
+                                                                          InitT,
+                                                                          AccumT>);
     }
     else
     {
       // Regular size
-      return InvokePasses<ActivePolicyT>(
-        DeviceReduceKernel<typename DispatchReduce::MaxPolicy,
-                           InputIteratorT,
-                           OffsetT,
-                           ReductionOpT,
-                           AccumT>,
-        DeviceReduceSingleTileKernel<MaxPolicyT,
-                                     AccumT *,
-                                     OutputIteratorT,
-                                     int, // Always used with int offsets
-                                     ReductionOpT,
-                                     InitT,
-                                     AccumT>);
+      return InvokePasses<ActivePolicyT>(DeviceReduceKernel<typename DispatchReduce::MaxPolicy,
+                                                            InputIteratorT,
+                                                            OffsetT,
+                                                            ReductionOpT,
+                                                            AccumT>,
+                                         DeviceReduceSingleTileKernel<MaxPolicyT,
+                                                                      AccumT *,
+                                                                      OutputIteratorT,
+                                                                      int, // Always used with int
+                                                                           // offsets
+                                                                      ReductionOpT,
+                                                                      InitT,
+                                                                      AccumT>);
     }
   }
 
@@ -915,42 +889,41 @@ struct DispatchReduce : SelectedPolicy
   /**
    * @brief Internal dispatch routine for computing a device-wide reduction
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in 
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out 
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_items 
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] reduction_op 
+   * @param[in] reduction_op
    *   Binary reduction functor
    *
-   * @param[in] init 
+   * @param[in] init
    *   The initial value of the reduction
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
-  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
-  Dispatch(void *d_temp_storage,
-           size_t &temp_storage_bytes,
-           InputIteratorT d_in,
-           OutputIteratorT d_out,
-           OffsetT num_items,
-           ReductionOpT reduction_op,
-           InitT init,
-           cudaStream_t stream)
+  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage,
+                                                                   size_t &temp_storage_bytes,
+                                                                   InputIteratorT d_in,
+                                                                   OutputIteratorT d_out,
+                                                                   OffsetT num_items,
+                                                                   ReductionOpT reduction_op,
+                                                                   InitT init,
+                                                                   cudaStream_t stream)
   {
     typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
 
@@ -959,7 +932,7 @@ struct DispatchReduce : SelectedPolicy
     {
       // Get PTX version
       int ptx_version = 0;
-      error = CubDebug(PtxVersion(ptx_version));
+      error           = CubDebug(PtxVersion(ptx_version));
       if (cudaSuccess != error)
       {
         break;
@@ -988,16 +961,15 @@ struct DispatchReduce : SelectedPolicy
   }
 
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
-  Dispatch(void *d_temp_storage,
-           size_t &temp_storage_bytes,
-           InputIteratorT d_in,
-           OutputIteratorT d_out,
-           OffsetT num_items,
-           ReductionOpT reduction_op,
-           InitT init,
-           cudaStream_t stream,
-           bool debug_synchronous)
+  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage,
+                                                                   size_t &temp_storage_bytes,
+                                                                   InputIteratorT d_in,
+                                                                   OutputIteratorT d_out,
+                                                                   OffsetT num_items,
+                                                                   ReductionOpT reduction_op,
+                                                                   InitT init,
+                                                                   cudaStream_t stream,
+                                                                   bool debug_synchronous)
   {
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
 
@@ -1012,14 +984,12 @@ struct DispatchReduce : SelectedPolicy
   }
 };
 
-
-
 /******************************************************************************
  * Segmented dispatch
  *****************************************************************************/
 
 /**
- * @brief Utility class for dispatching the appropriately-tuned kernels for 
+ * @brief Utility class for dispatching the appropriately-tuned kernels for
  *        device-wide reduction
  *
  * @tparam InputIteratorT
@@ -1029,40 +999,34 @@ struct DispatchReduce : SelectedPolicy
  *   Output iterator type for recording the reduced aggregate \iterator
  *
  * @tparam BeginOffsetIteratorT
- *   Random-access input iterator type for reading segment beginning offsets 
+ *   Random-access input iterator type for reading segment beginning offsets
  *   \iterator
  *
  * @tparam EndOffsetIteratorT
- *   Random-access input iterator type for reading segment ending offsets 
+ *   Random-access input iterator type for reading segment ending offsets
  *   \iterator
  *
  * @tparam OffsetT
  *   Signed integer type for global offsets
  *
  * @tparam ReductionOpT
- *   Binary reduction functor type having member 
+ *   Binary reduction functor type having member
  *   `auto operator()(const T &a, const U &b)`
  *
- * @tparam InitT 
+ * @tparam InitT
  *   value type
  */
-template <
-  typename InputIteratorT,
-  typename OutputIteratorT,
-  typename BeginOffsetIteratorT,
-  typename EndOffsetIteratorT,
-  typename OffsetT,
-  typename ReductionOpT,
-  typename InitT = 
-    cub::detail::non_void_value_t<
-      OutputIteratorT, 
-      cub::detail::value_t<InputIteratorT>>,
-  typename AccumT = 
-    detail::accumulator_t<
-      ReductionOpT, 
-      InitT, 
-      cub::detail::value_t<InputIteratorT>>,
-  typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename OffsetT,
+          typename ReductionOpT,
+          typename InitT =
+            cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
+          typename AccumT =
+            detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
+          typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
 struct DispatchSegmentedReduce : SelectedPolicy
 {
   //---------------------------------------------------------------------------
@@ -1084,7 +1048,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
   OutputIteratorT d_out;
 
   /// The number of segments that comprise the sorting data
-  OffsetT num_segments;
+  int num_segments;
 
   /// Random-access input iterator to the sequence of beginning offsets of
   /// length `num_segments`, such that `d_begin_offsets[i]` is the first
@@ -1115,18 +1079,17 @@ struct DispatchSegmentedReduce : SelectedPolicy
   //---------------------------------------------------------------------------
 
   /// Constructor
-  CUB_RUNTIME_FUNCTION __forceinline__
-  DispatchSegmentedReduce(void *d_temp_storage,
-                          size_t &temp_storage_bytes,
-                          InputIteratorT d_in,
-                          OutputIteratorT d_out,
-                          OffsetT num_segments,
-                          BeginOffsetIteratorT d_begin_offsets,
-                          EndOffsetIteratorT d_end_offsets,
-                          ReductionOpT reduction_op,
-                          InitT init,
-                          cudaStream_t stream,
-                          int ptx_version)
+  CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedReduce(void *d_temp_storage,
+                                                               size_t &temp_storage_bytes,
+                                                               InputIteratorT d_in,
+                                                               OutputIteratorT d_out,
+                                                               int num_segments,
+                                                               BeginOffsetIteratorT d_begin_offsets,
+                                                               EndOffsetIteratorT d_end_offsets,
+                                                               ReductionOpT reduction_op,
+                                                               InitT init,
+                                                               cudaStream_t stream,
+                                                               int ptx_version)
       : d_temp_storage(d_temp_storage)
       , temp_storage_bytes(temp_storage_bytes)
       , d_in(d_in)
@@ -1141,19 +1104,18 @@ struct DispatchSegmentedReduce : SelectedPolicy
   {}
 
   CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
-  CUB_RUNTIME_FUNCTION __forceinline__
-  DispatchSegmentedReduce(void *d_temp_storage,
-                          size_t &temp_storage_bytes,
-                          InputIteratorT d_in,
-                          OutputIteratorT d_out,
-                          OffsetT num_segments,
-                          BeginOffsetIteratorT d_begin_offsets,
-                          EndOffsetIteratorT d_end_offsets,
-                          ReductionOpT reduction_op,
-                          InitT init,
-                          cudaStream_t stream,
-                          bool debug_synchronous,
-                          int ptx_version)
+  CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedReduce(void *d_temp_storage,
+                                                               size_t &temp_storage_bytes,
+                                                               InputIteratorT d_in,
+                                                               OutputIteratorT d_out,
+                                                               int num_segments,
+                                                               BeginOffsetIteratorT d_begin_offsets,
+                                                               EndOffsetIteratorT d_end_offsets,
+                                                               ReductionOpT reduction_op,
+                                                               InitT init,
+                                                               cudaStream_t stream,
+                                                               bool debug_synchronous,
+                                                               int ptx_version)
       : d_temp_storage(d_temp_storage)
       , temp_storage_bytes(temp_storage_bytes)
       , d_in(d_in)
@@ -1183,7 +1145,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
    *   Function type of cub::DeviceSegmentedReduceKernel
    *
    * @param[in] segmented_reduce_kernel
-   *   Kernel function pointer to parameterization of 
+   *   Kernel function pointer to parameterization of
    *   cub::DeviceSegmentedReduceKernel
    */
   template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
@@ -1211,8 +1173,8 @@ struct DispatchSegmentedReduce : SelectedPolicy
         break;
       }
 
-      // Log device_reduce_sweep_kernel configuration
-      #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+// Log device_reduce_sweep_kernel configuration
+#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
       _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), "
               "%d items per thread, %d SM occupancy\n",
               num_segments,
@@ -1220,7 +1182,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
               (long long)stream,
               ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
               segmented_reduce_config.sm_occupancy);
-      #endif
+#endif
 
       // Invoke DeviceReduceKernel
       THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
@@ -1262,16 +1224,15 @@ struct DispatchSegmentedReduce : SelectedPolicy
     typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
 
     // Force kernel code-generation in all compiler passes
-    return InvokePasses<ActivePolicyT>(
-      DeviceSegmentedReduceKernel<MaxPolicyT,
-                                  InputIteratorT,
-                                  OutputIteratorT,
-                                  BeginOffsetIteratorT,
-                                  EndOffsetIteratorT,
-                                  OffsetT,
-                                  ReductionOpT,
-                                  InitT,
-                                  AccumT>);
+    return InvokePasses<ActivePolicyT>(DeviceSegmentedReduceKernel<MaxPolicyT,
+                                                                   InputIteratorT,
+                                                                   OutputIteratorT,
+                                                                   BeginOffsetIteratorT,
+                                                                   EndOffsetIteratorT,
+                                                                   OffsetT,
+                                                                   ReductionOpT,
+                                                                   InitT,
+                                                                   AccumT>);
   }
 
   //---------------------------------------------------------------------------
@@ -1282,8 +1243,8 @@ struct DispatchSegmentedReduce : SelectedPolicy
    * @brief Internal dispatch routine for computing a device-wide reduction
    *
    * @param[in] d_temp_storage
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
    * @param[in,out] temp_storage_bytes
@@ -1299,16 +1260,16 @@ struct DispatchSegmentedReduce : SelectedPolicy
    *   The number of segments that comprise the sorting data
    *
    * @param[in] d_begin_offsets
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
    * @param[in] d_end_offsets
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
    * @param[in] reduction_op
@@ -1318,7 +1279,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
    *   The initial value of the reduction
    *
    * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within.  
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
@@ -1346,7 +1307,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
     {
       // Get PTX version
       int ptx_version = 0;
-      error = CubDebug(PtxVersion(ptx_version));
+      error           = CubDebug(PtxVersion(ptx_version));
       if (cudaSuccess != error)
       {
         break;
@@ -1405,6 +1366,4 @@ struct DispatchSegmentedReduce : SelectedPolicy
   }
 };
 
-
 CUB_NAMESPACE_END
-
diff --git a/cub/test/c2h/generators.cu b/cub/test/c2h/generators.cu
index d86e5c73a03..02f46f578f8 100644
--- a/cub/test/c2h/generators.cu
+++ b/cub/test/c2h/generators.cu
@@ -452,10 +452,19 @@ gen_uniform_offsets(seed_t seed, T total_elements, T min_segment_size, T max_seg
   return segment_offsets;
 }
 
+template thrust::device_vector<int32_t> gen_uniform_offsets(seed_t seed,
+                                                             int32_t total_elements,
+                                                             int32_t min_segment_size,
+                                                             int32_t max_segment_size);
+
 template thrust::device_vector<uint32_t> gen_uniform_offsets(seed_t seed,
                                                              uint32_t total_elements,
                                                              uint32_t min_segment_size,
                                                              uint32_t max_segment_size);
+template thrust::device_vector<int64_t> gen_uniform_offsets(seed_t seed,
+                                                             int64_t total_elements,
+                                                             int64_t min_segment_size,
+                                                             int64_t max_segment_size);
 template thrust::device_vector<uint64_t> gen_uniform_offsets(seed_t seed,
                                                              uint64_t total_elements,
                                                              uint64_t min_segment_size,
diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu
index 98e6f5a2d01..7ced7fd29d6 100644
--- a/cub/test/catch2_test_device_segmented_reduce.cu
+++ b/cub/test/catch2_test_device_segmented_reduce.cu
@@ -78,14 +78,17 @@ type_pair<custom_t>
 // clang-format on
 #endif
 
+using offsets = c2h::type_list<std::int32_t, std::uint32_t>;
+
 CUB_TEST("Device reduce works with all device interfaces",
          "[segmented][reduce][device]",
-         full_type_list)
+         full_type_list,
+         offsets)
 {
-  using params   = params_t<TestType>;
-  using item_t   = typename params::item_t;
-  using output_t = typename params::output_t;
-  using offset_t = uint32_t;
+  using type_pair_t = typename c2h::get<0, TestType>;
+  using input_t      = typename type_pair_t::input_t;
+  using output_t    = typename type_pair_t::output_t;
+  using offset_t    = typename c2h::get<1, TestType>;
 
   constexpr int min_items = 1;
   constexpr int max_items = 1000000;
@@ -115,7 +118,7 @@ CUB_TEST("Device reduce works with all device interfaces",
   auto d_offsets_it           = thrust::raw_pointer_cast(segment_offsets.data());
 
   // Generate input data
-  thrust::device_vector<item_t> in_items(num_items);
+  thrust::device_vector<input_t> in_items(num_items);
   c2h::gen(CUB_SEED(2), in_items);
   auto d_in_it = thrust::raw_pointer_cast(in_items.data());
 
@@ -127,7 +130,7 @@ CUB_TEST("Device reduce works with all device interfaces",
     auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
 
     // Prepare verification data
-    using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
+    using accum_t = cub::detail::accumulator_t<op_t, output_t, input_t>;
     thrust::host_vector<output_t> expected_result(num_segments);
     compute_segmented_problem_reference(in_items,
                                         segment_offsets,
@@ -157,7 +160,7 @@ CUB_TEST("Device reduce works with all device interfaces",
   SECTION("sum")
   {
     using op_t    = cub::Sum;
-    using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
+    using accum_t = cub::detail::accumulator_t<op_t, output_t, input_t>;
 
     // Prepare verification data
     thrust::host_vector<output_t> expected_result(num_segments);
@@ -186,7 +189,7 @@ CUB_TEST("Device reduce works with all device interfaces",
     compute_segmented_problem_reference(in_items,
                                         segment_offsets,
                                         op_t{},
-                                        cub::NumericTraits<item_t>::Max(),
+                                        cub::NumericTraits<input_t>::Max(),
                                         expected_result.begin());
 
     // Run test
@@ -211,7 +214,7 @@ CUB_TEST("Device reduce works with all device interfaces",
     compute_segmented_problem_reference(in_items,
                                         segment_offsets,
                                         op_t{},
-                                        cub::NumericTraits<item_t>::Lowest(),
+                                        cub::NumericTraits<input_t>::Lowest(),
                                         expected_result.begin());
 
     // Run test
@@ -236,7 +239,6 @@ CUB_TEST("Device reduce works with all device interfaces",
     compute_segmented_argmax_reference(in_items, segment_offsets, expected_result.begin());
 
     // Run test
-    using result_t = cub::KeyValuePair<int, output_t>;
     thrust::device_vector<result_t> out_result(num_segments);
     device_segmented_arg_max(d_in_it,
                              thrust::raw_pointer_cast(out_result.data()),
@@ -253,7 +255,7 @@ CUB_TEST("Device reduce works with all device interfaces",
     using result_t = cub::KeyValuePair<int, output_t>;
 
     // Prepare verification data
-    thrust::host_vector<item_t> host_items(in_items);
+    thrust::host_vector<input_t> host_items(in_items);
     thrust::host_vector<result_t> expected_result(num_segments);
     compute_segmented_argmin_reference(in_items, segment_offsets, expected_result.begin());
 
diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu
index 39fae9191a2..8215cf6fe1f 100644
--- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu
+++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu
@@ -51,15 +51,17 @@ DECLARE_CDP_WRAPPER(cub::DeviceSegmentedReduce::Sum, device_segmented_sum);
 // List of types to test
 using custom_t           = c2h::custom_type_t<c2h::accumulateable_t, c2h::equal_comparable_t>;
 using iterator_type_list = c2h::type_list<type_pair<custom_t>, type_pair<std::int64_t>>;
+using offsets            = c2h::type_list<std::int32_t, std::uint32_t>;
 
 CUB_TEST("Device segmented reduce works with fancy input iterators",
          "[reduce][device]",
-         iterator_type_list)
+         iterator_type_list,
+         offsets)
 {
-  using params   = params_t<TestType>;
-  using item_t   = typename params::item_t;
-  using output_t = typename params::output_t;
-  using offset_t = uint32_t;
+  using type_pair_t = typename c2h::get<0, TestType>;
+  using item_t      = typename type_pair_t::input_t;
+  using output_t    = typename type_pair_t::output_t;
+  using offset_t    = typename c2h::get<1, TestType>;
 
   constexpr int min_items = 1;
   constexpr int max_items = 1000000;
diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu
new file mode 100644
index 00000000000..f39a0d50112
--- /dev/null
+++ b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu
@@ -0,0 +1,96 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_segmented_reduce.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <cstdint>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2/catch.hpp"
+#include "catch2_test_cdp_helper.h"
+#include "catch2_test_helper.h"
+
+DECLARE_CDP_WRAPPER(cub::DeviceSegmentedReduce::Reduce, device_segmented_reduce);
+DECLARE_CDP_WRAPPER(cub::DeviceSegmentedReduce::Sum, device_segmented_sum);
+
+// %PARAM% TEST_CDP cdp 0:1
+
+// List of types to test
+using offsets = c2h::type_list<std::ptrdiff_t, std::size_t>;
+
+CUB_TEST("Device segmented reduce works with fancy input iterators and 64-bit offsets",
+         "[reduce][device]",
+         offsets)
+{
+  using offset_t       = typename c2h::get<0, TestType>;
+  using op_t           = cub::Sum;
+
+  constexpr offset_t offset_zero           = 0;
+  constexpr offset_t offset_one            = 1;
+  constexpr offset_t iterator_value        = 2;
+  constexpr offset_t min_items_per_segment = offset_one << 31;
+  constexpr offset_t max_items_per_segment = offset_one << 33;
+
+  constexpr int num_segments = 2;
+
+  // generate individual segment lengths and store cumulative sum in segment_offsets
+  const offset_t num_items_in_first_segment =
+    GENERATE_COPY(take(2, random(min_items_per_segment, max_items_per_segment)));
+  const offset_t num_items_in_second_segment =
+    GENERATE_COPY(take(2, random(min_items_per_segment, max_items_per_segment)));
+  thrust::device_vector<offset_t> segment_offsets = {offset_zero,
+                                                     num_items_in_first_segment,
+                                                     num_items_in_first_segment +
+                                                       num_items_in_second_segment};
+
+  // store expected result and initialize device output container
+  thrust::host_vector<offset_t> expected_result = {iterator_value * num_items_in_first_segment,
+                                                   iterator_value * num_items_in_second_segment};
+  thrust::device_vector<offset_t> device_result(num_segments);
+
+  // prepare device iterators
+  auto in_it        = thrust::make_constant_iterator(iterator_value);
+  auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data());
+  auto d_out_it     = thrust::raw_pointer_cast(device_result.data());
+
+  // reduce
+  device_segmented_reduce(in_it,
+                          d_out_it,
+                          num_segments,
+                          d_offsets_it,
+                          d_offsets_it + 1,
+                          op_t{},
+                          offset_t{});
+
+  // verify result
+  REQUIRE(expected_result == device_result);
+}