diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 707a28424e5..e30f5b3ee91 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -17,10 +17,13 @@ #include #include +#include #include #include #include +#include +#include #include #include @@ -75,7 +78,7 @@ void label_segments(InputIterator offsets_begin, // If the offsets array has no more than 2 offset values, there will be at max 1 segment. // In such cases, the output will just be an array of all `0` values (which we already filled). - // We should terminate here, otherwise the `inclusive_scan` call below still do its entire + // We should terminate from here, otherwise the `inclusive_scan` call below still does its entire // computation. That is unnecessary and may be expensive if we have the input offsets defining a // very large segment. if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; } @@ -96,4 +99,91 @@ void label_segments(InputIterator offsets_begin, thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin); } +/** + * @brief Generate segment offsets from groups of identical label values. + * + * Given a pair of iterators accessing an array containing groups of identical label values, + * generate offsets for segments defined by these labels. + * + * Empty segments are also taken into account. If the input label values are discontinuous, the + * segments corresponding to the missing labels will be inferred as empty segments and their offsets + * will also be generated. + * + * Note that the caller is responsible for making sure the output range for offsets has the correct + * size, which is the maximum label value plus two (i.e., `size = *(labels_end - 1) + 2`). + * Otherwise, the result is undefined. 
+ * + * @code{.pseudo} + * Examples: + * + * labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ] + * output = [ 0, 4, 6, 6, 6, 10 ] + * + * labels = [ 0, 0, 0, 0, 0, 1, 1 ] + * output = [ 0, 5, 7 ] + * @endcode + * + * @param labels_begin The beginning of the labels that define segments. + * @param labels_end The end of the labels that define segments. + * @param offsets_begin The beginning of the output offset range. + * @param offsets_end The end of the output offset range. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +template +void labels_to_offsets(InputIterator labels_begin, + InputIterator labels_end, + OutputIterator offsets_begin, + OutputIterator offsets_end, + rmm::cuda_stream_view stream) +{ + // Always fill the entire output array with `0` value regardless of the input. + using OutputType = typename thrust::iterator_value::type; + thrust::uninitialized_fill(rmm::exec_policy(stream), offsets_begin, offsets_end, OutputType{0}); + + // If there are no label values, we will have zero segments or all empty segments. We should + // terminate from here because: + // - If we have zero segments, the output array is empty, thus `num_segments` computed below is + // wrong and may cascade to undefined behavior if we continue. + // - If we have all empty segments, the output offset values will be all `0`, which we already + // filled above. If we continue, the `exclusive_scan` call below still does its entire + // computation. That is unnecessary and may be expensive if we have the input labels defining + // a very large number of segments. + if (thrust::distance(labels_begin, labels_end) == 0) { return; } + + auto const num_segments = thrust::distance(offsets_begin, offsets_end) - 1; + + //================================================================================ + // Let's consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. + + // This stores the unique label values. 
+ // Given the example above, we will have this array containing [0, 1, 4]. + auto list_indices = rmm::device_uvector(num_segments, stream); + + // Stores the non-zero segment sizes. + // Given the example above, we will have this array containing [4, 2, 4]. + auto list_sizes = rmm::device_uvector(num_segments, stream); + + // Count the number of labels in each segment. + auto const end = thrust::reduce_by_key(rmm::exec_policy(stream), + labels_begin, // keys + labels_end, // keys + thrust::make_constant_iterator(1), + list_indices.begin(), // output unique label values + list_sizes.begin()); // count for each label + auto const num_non_empty_segments = thrust::distance(list_indices.begin(), end.first); + + // Scatter segment sizes into the output at the positions given by their corresponding labels. + // Given the example above, we scatter [4, 2, 4] by the scatter map [0, 1, 4], resulting + // output = [4, 2, 0, 0, 4, 0]. + thrust::scatter(rmm::exec_policy(stream), + list_sizes.begin(), + list_sizes.begin() + num_non_empty_segments, + list_indices.begin(), + offsets_begin); + + // Generate offsets from sizes. + // Given the example above, the final output is [0, 4, 6, 6, 6, 10]. + thrust::exclusive_scan(rmm::exec_policy(stream), offsets_begin, offsets_end, offsets_begin); +} + } // namespace cudf::detail diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 929af866b6d..c04cc71c7db 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -40,12 +40,7 @@ #include #include #include -#include #include -#include -#include -#include -#include #include @@ -436,72 +431,6 @@ std::vector> get_unique_entries_and_list_indices( ->release(); } -/** - * @brief Generate list offsets from entry list indices for the final result lists column(s). - * - * @param num_lists The number of lists. - * @param entries_list_indices The mapping from list entries to their (1-based) list indices. 
- * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device resource used to allocate memory. - */ -std::unique_ptr generate_output_offsets(size_type num_lists, - column_view const& entries_list_indices, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Let consider an example: - // Given the original offsets of the input lists column is [0, 4, 5, 6, 7, 10, 11, 13]. - // The original entries_list_indices is [1, 1, 1, 1, 2, 3, 4, 5, 5, 5, 6, 7, 7], and after - // extracting unique entries we have the entries_list_indices becomes [1, 1, 1, 4, 5, 5, 5, 7, 7] - // and num_lists is 7. These are the input to this function. - // - // Through extracting unique list entries, one entry in the list index 1 has been removed (first - // list, as we are using 1-based list index), and entries in the lists with indices {3, 3, 6} have - // been removed completely. - - // This variable stores the (1-based) list indices of the unique entries but only one index value - // per non-empty list. Given the example above, we will have this array hold the values - // [1, 4, 5, 7]. - auto list_indices = rmm::device_uvector(num_lists, stream); - - // Stores the non-zero numbers of unique entries per list. - // Given the example above, we will have this array contains the values [3, 1, 3, 2] - auto list_sizes = rmm::device_uvector(num_lists, stream); - - // Count the numbers of unique entries for each non-empty list. - auto const end = thrust::reduce_by_key(rmm::exec_policy(stream), - entries_list_indices.template begin(), - entries_list_indices.template end(), - thrust::make_constant_iterator(1), - list_indices.begin(), - list_sizes.begin()); - auto const num_non_empty_lists = thrust::distance(list_indices.begin(), end.first); - - // The output offsets for the output lists column(s). - auto new_offsets = rmm::device_uvector(num_lists + 1, stream, mr); - - // The new offsets need to be filled with 0 value first. 
- thrust::uninitialized_fill_n( - rmm::exec_policy(stream), new_offsets.begin(), num_lists + 1, offset_type{0}); - - // Scatter non-zero sizes of the output lists into the correct positions. - // Given the example above, we will have new_offsets = [0, 3, 0, 0, 1, 3, 0, 2] - thrust::scatter(rmm::exec_policy(stream), - list_sizes.begin(), - list_sizes.begin() + num_non_empty_lists, - list_indices.begin(), - new_offsets.begin()); - - // Generate offsets from sizes. - // Given the example above, we will have new_offsets = [0, 3, 3, 3, 4, 7, 7, 9] - thrust::exclusive_scan( - rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); - - // Done. Hope that your head didn't explode after reading till this point. - return std::make_unique( - data_type{type_to_id()}, num_lists + 1, new_offsets.release()); -} - /** * @brief Common execution code called by all public `drop_list_duplicates` APIs. */ @@ -594,11 +523,19 @@ std::pair, std::unique_ptr> drop_list_duplicates mr); // Generate offsets for the output lists column(s). - auto output_offsets = generate_output_offsets( - keys.size(), - unique_entries_and_list_indices.back()->view(), // unique entries' list indices - stream, - mr); + auto output_offsets = [&] { + auto out_offsets = make_numeric_column( + data_type{type_to_id()}, keys.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto const offsets = out_offsets->mutable_view(); + auto const labels = + unique_entries_and_list_indices.back()->view(); // unique entries' list indices + cudf::detail::labels_to_offsets(labels.template begin(), + labels.template end(), + offsets.template begin(), + offsets.template end(), + stream); + return out_offsets; + }(); // If the values lists column is not given, its corresponding output will be nullptr. 
auto out_values =