diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9678fc39128..6f7ef6cc681 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -476,6 +476,7 @@ add_library( src/sort/segmented_sort.cu src/sort/sort_column.cu src/sort/sort.cu + src/sort/stable_segmented_sort.cu src/sort/stable_sort_column.cu src/sort/stable_sort.cu src/stream_compaction/apply_boolean_mask.cu diff --git a/cpp/benchmarks/sort/segmented_sort.cpp b/cpp/benchmarks/sort/segmented_sort.cpp index 7162269853c..e3459291caf 100644 --- a/cpp/benchmarks/sort/segmented_sort.cpp +++ b/cpp/benchmarks/sort/segmented_sort.cpp @@ -28,6 +28,7 @@ void nvbench_segmented_sort(nvbench::state& state) { cudf::rmm_pool_raii pool_raii; + auto const stable = static_cast(state.get_int64("stable")); auto const dtype = cudf::type_to_id(); auto const size_bytes = static_cast(state.get_int64("size_bytes")); auto const null_freq = state.get_float64("null_frequency"); @@ -50,12 +51,16 @@ void nvbench_segmented_sort(nvbench::state& state) state.add_global_memory_writes(rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::segmented_sorted_order(*input, *segments); + if (stable) + cudf::stable_segmented_sorted_order(*input, *segments); + else + cudf::segmented_sorted_order(*input, *segments); }); } NVBENCH_BENCH(nvbench_segmented_sort) .set_name("segmented_sort") + .add_int64_axis("stable", {0, 1}) .add_int64_power_of_two_axis("size_bytes", {16, 18, 20, 22, 24, 28}) .add_float64_axis("null_frequency", {0, 0.1}) .add_int64_axis("row_width", {16, 128, 1024}); diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu index 685d8aa3ec1..38d008c120c 100644 --- a/cpp/src/sort/segmented_sort.cu +++ b/cpp/src/sort/segmented_sort.cu @@ -14,12 +14,11 @@ * limitations under the License. */ -#include -#include -#include +#include "segmented_sort_impl.cuh" + #include -#include #include +#include #include #include @@ -28,165 +27,9 @@ #include #include -#include - namespace cudf { namespace detail { -namespace { -/** - * @brief The enum specifying which sorting method to use (stable or unstable). - */ -enum class sort_method { STABLE, UNSTABLE }; - -/** - * @brief Functor performs faster segmented sort on eligible columns - */ -struct column_fast_sort_fn { - /** - * @brief Run-time check for faster segmented sort on an eligible column - * - * Fast segmented sort can handle integral types including - * decimal types if dispatch_storage_type is used but it does not support int128. - */ - static bool is_fast_sort_supported(column_view const& col) - { - return !col.has_nulls() and - (cudf::is_integral(col.type()) || - (cudf::is_fixed_point(col.type()) and (col.type().id() != type_id::DECIMAL128))); - } - - /** - * @brief Compile-time check for supporting fast segmented sort for a specific type - * - * The dispatch_storage_type means we can check for integral types to - * include fixed-point types but the CUB limitation means we need to exclude int128. - */ - template - static constexpr bool is_fast_sort_supported() - { - return cudf::is_integral() and !std::is_same_v<__int128, T>; - } - - template - void fast_sort(column_view const& input, - column_view const& segment_offsets, - mutable_column_view& indices, - bool ascending, - rmm::cuda_stream_view stream) - { - // CUB's segmented sort functions cannot accept iterators. - // We create a temporary column here for it to use. - auto temp_col = - cudf::detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream); - mutable_column_view output_view = temp_col->mutable_view(); - - // DeviceSegmentedSort is faster then DeviceSegmentedRadixSort at this time - auto fast_sort_impl = [stream](bool ascending, [[maybe_unused]] auto&&... args) { - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - if (ascending) { - cub::DeviceSegmentedSort::SortPairs( - d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - cub::DeviceSegmentedSort::SortPairs( - d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); - } else { - cub::DeviceSegmentedSort::SortPairsDescending( - d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - cub::DeviceSegmentedSort::SortPairsDescending( - d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); - } - }; - - fast_sort_impl(ascending, - input.begin(), - output_view.begin(), - indices.begin(), - indices.begin(), - input.size(), - segment_offsets.size() - 1, - segment_offsets.begin(), - segment_offsets.begin() + 1, - stream.value()); - } - - template ())> - void operator()(column_view const& input, - column_view const& segment_offsets, - mutable_column_view& indices, - bool ascending, - rmm::cuda_stream_view stream) - { - fast_sort(input, segment_offsets, indices, ascending, stream); - } - - template ())> - void operator()( - column_view const&, column_view const&, mutable_column_view&, bool, rmm::cuda_stream_view) - { - CUDF_FAIL("Column type cannot be used with fast-sort function"); - } -}; - -/** - * @brief Performs faster sort on eligible columns - * - * Check the `is_fast_sort_supported()==true` on the input column before using this function. - * - * @param input Column to sort - * @param segment_offsets Identifies segments to sort within - * @param column_order Sort ascending or descending - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - */ -std::unique_ptr fast_segmented_sorted_order(column_view const& input, - column_view const& segment_offsets, - order const& column_order, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Unfortunately, CUB's segmented sort functions cannot accept iterators. - // We have to build a pre-filled sequence of indices as input. - auto sorted_indices = - cudf::detail::sequence(input.size(), numeric_scalar{0}, stream, mr); - auto indices_view = sorted_indices->mutable_view(); - - cudf::type_dispatcher(input.type(), - column_fast_sort_fn{}, - input, - segment_offsets, - indices_view, - column_order == order::ASCENDING, - stream); - return sorted_indices; -} - -/** - * @brief Builds indices to identify segments to sort - * - * The segments are added to the input table-view keys so they - * are lexicographically sorted within the segmented groups. - * - * ``` - * Example 1: - * num_rows = 10 - * offsets = {0, 3, 7, 10} - * segment-indices -> { 3,3,3, 7,7,7,7, 10,10,10 } - * ``` - * - * ``` - * Example 2: (offsets do not cover all indices) - * num_rows = 10 - * offsets = {3, 7} - * segment-indices -> { 0,1,2, 7,7,7,7, 8,9,10 } - * ``` - * - * @param num_rows Total number of rows in the input keys to sort - * @param offsets The offsets identifying the segments - * @param stream CUDA stream used for device memory operations and kernel launches - */ rmm::device_uvector get_segment_indices(size_type num_rows, column_view const& offsets, rmm::cuda_stream_view stream) @@ -209,117 +52,6 @@ rmm::device_uvector get_segment_indices(size_type num_rows, return segment_ids; } -std::unique_ptr segmented_sorted_order_common( - table_view const& keys, - column_view const& segment_offsets, - std::vector const& column_order, - std::vector const& null_precedence, - sort_method sorting, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (keys.num_rows() == 0 || keys.num_columns() == 0) { - return cudf::make_empty_column(type_to_id()); - } - - CUDF_EXPECTS(segment_offsets.type() == data_type(type_to_id()), - "segment offsets should be size_type"); - - if (not column_order.empty()) { - CUDF_EXPECTS(static_cast(keys.num_columns()) == column_order.size(), - "Mismatch between number of columns and column order."); - } - - if (not null_precedence.empty()) { - CUDF_EXPECTS(static_cast(keys.num_columns()) == null_precedence.size(), - "Mismatch between number of columns and null_precedence size."); - } - - // the average row size for which to prefer fast sort - constexpr cudf::size_type MAX_AVG_LIST_SIZE_FOR_FAST_SORT{100}; - // the maximum row count for which to prefer fast sort - constexpr cudf::size_type MAX_LIST_SIZE_FOR_FAST_SORT{1 << 18}; - - // fast-path for single column sort: - // - single-column table - // - not stable-sort - // - no nulls and allowable fixed-width type - // - size and width are limited -- based on benchmark results - if (keys.num_columns() == 1 and sorting == sort_method::UNSTABLE and - column_fast_sort_fn::is_fast_sort_supported(keys.column(0)) and - (segment_offsets.size() > 0) and - (((keys.num_rows() / segment_offsets.size()) < MAX_AVG_LIST_SIZE_FOR_FAST_SORT) or - (keys.num_rows() < MAX_LIST_SIZE_FOR_FAST_SORT))) { - auto const col_order = column_order.empty() ? order::ASCENDING : column_order.front(); - return fast_segmented_sorted_order(keys.column(0), segment_offsets, col_order, stream, mr); - } - - // Get segment id of each element in all segments. - auto segment_ids = get_segment_indices(keys.num_rows(), segment_offsets, stream); - - // insert segment id before all columns. - std::vector keys_with_segid; - keys_with_segid.reserve(keys.num_columns() + 1); - keys_with_segid.push_back( - column_view(data_type(type_to_id()), segment_ids.size(), segment_ids.data())); - keys_with_segid.insert(keys_with_segid.end(), keys.begin(), keys.end()); - auto segid_keys = table_view(keys_with_segid); - - auto prepend_default = [](auto const& vector, auto default_value) { - if (vector.empty()) return vector; - std::remove_cv_t> pre_vector; - pre_vector.reserve(pre_vector.size() + 1); - pre_vector.push_back(default_value); - pre_vector.insert(pre_vector.end(), vector.begin(), vector.end()); - return pre_vector; - }; - auto child_column_order = prepend_default(column_order, order::ASCENDING); - auto child_null_precedence = prepend_default(null_precedence, null_order::AFTER); - - // return sorted order of child columns - return sorting == sort_method::STABLE - ? detail::stable_sorted_order( - segid_keys, child_column_order, child_null_precedence, stream, mr) - : detail::sorted_order( - segid_keys, child_column_order, child_null_precedence, stream, mr); -} - -std::unique_ptr segmented_sort_by_key_common(table_view const& values, - table_view const& keys, - column_view const& segment_offsets, - std::vector const& column_order, - std::vector const& null_precedence, - sort_method sorting, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(values.num_rows() == keys.num_rows(), - "Mismatch in number of rows for values and keys"); - auto sorted_order = sorting == sort_method::STABLE - ? stable_segmented_sorted_order(keys, - segment_offsets, - column_order, - null_precedence, - stream, - rmm::mr::get_current_device_resource()) - : segmented_sorted_order(keys, - segment_offsets, - column_order, - null_precedence, - stream, - rmm::mr::get_current_device_resource()); - - // Gather segmented sort of child value columns` - return detail::gather(values, - sorted_order->view(), - out_of_bounds_policy::DONT_CHECK, - detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); -} - -} // namespace - std::unique_ptr segmented_sorted_order(table_view const& keys, column_view const& segment_offsets, std::vector const& column_order, @@ -327,20 +59,8 @@ std::unique_ptr segmented_sorted_order(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return segmented_sorted_order_common( - keys, segment_offsets, column_order, null_precedence, sort_method::UNSTABLE, stream, mr); -} - -std::unique_ptr stable_segmented_sorted_order( - table_view const& keys, - column_view const& segment_offsets, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return segmented_sorted_order_common( - keys, segment_offsets, column_order, null_precedence, sort_method::STABLE, stream, mr); + return segmented_sorted_order_common( + keys, segment_offsets, column_order, null_precedence, stream, mr); } std::unique_ptr
segmented_sort_by_key(table_view const& values, @@ -351,26 +71,8 @@ std::unique_ptr
segmented_sort_by_key(table_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return segmented_sort_by_key_common(values, - keys, - segment_offsets, - column_order, - null_precedence, - sort_method::UNSTABLE, - stream, - mr); -} - -std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, - table_view const& keys, - column_view const& segment_offsets, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return segmented_sort_by_key_common( - values, keys, segment_offsets, column_order, null_precedence, sort_method::STABLE, stream, mr); + return segmented_sort_by_key_common( + values, keys, segment_offsets, column_order, null_precedence, stream, mr); } } // namespace detail @@ -386,18 +88,6 @@ std::unique_ptr segmented_sorted_order(table_view const& keys, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); } -std::unique_ptr stable_segmented_sorted_order( - table_view const& keys, - column_view const& segment_offsets, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::stable_segmented_sorted_order( - keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); -} - std::unique_ptr
segmented_sort_by_key(table_view const& values, table_view const& keys, column_view const& segment_offsets, @@ -410,16 +100,4 @@ std::unique_ptr
segmented_sort_by_key(table_view const& values, values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); } -std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, - table_view const& keys, - column_view const& segment_offsets, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::stable_segmented_sort_by_key( - values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); -} - } // namespace cudf diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh new file mode 100644 index 00000000000..a32382b840f --- /dev/null +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief The enum specifying which sorting method to use (stable or unstable). + */ +enum class sort_method { STABLE, UNSTABLE }; + +/** + * @brief Functor performs faster segmented sort on eligible columns + */ +template +struct column_fast_sort_fn { + /** + * @brief Run-time check for faster segmented sort on an eligible column + * + * Fast segmented sort can handle integral types including + * decimal types if dispatch_storage_type is used but it does not support int128. + */ + static bool is_fast_sort_supported(column_view const& col) + { + return !col.has_nulls() and + (cudf::is_integral(col.type()) || + (cudf::is_fixed_point(col.type()) and (col.type().id() != type_id::DECIMAL128))); + } + + /** + * @brief Compile-time check for supporting fast segmented sort for a specific type + * + * The dispatch_storage_type means we can check for integral types to + * include fixed-point types but the CUB limitation means we need to exclude int128. + */ + template + static constexpr bool is_fast_sort_supported() + { + return cudf::is_integral() and !std::is_same_v<__int128, T>; + } + + template + void fast_sort(column_view const& input, + column_view const& segment_offsets, + mutable_column_view& indices, + bool ascending, + rmm::cuda_stream_view stream) + { + // CUB's segmented sort functions cannot accept iterators. + // We create a temporary column here for it to use. + auto temp_col = + cudf::detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream); + mutable_column_view output_view = temp_col->mutable_view(); + + // DeviceSegmentedSort is faster than DeviceSegmentedRadixSort at this time + auto fast_sort_impl = [stream](bool ascending, [[maybe_unused]] auto&&... args) { + rmm::device_buffer d_temp_storage; + size_t temp_storage_bytes = 0; + if (ascending) { + if constexpr (method == sort_method::STABLE) { + cub::DeviceSegmentedSort::StableSortPairs( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; + cub::DeviceSegmentedSort::StableSortPairs( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + } else { + cub::DeviceSegmentedSort::SortPairs( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; + cub::DeviceSegmentedSort::SortPairs( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + } + } else { + if constexpr (method == sort_method::STABLE) { + cub::DeviceSegmentedSort::StableSortPairsDescending( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; + cub::DeviceSegmentedSort::StableSortPairsDescending( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + } else { + cub::DeviceSegmentedSort::SortPairsDescending( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; + cub::DeviceSegmentedSort::SortPairsDescending( + d_temp_storage.data(), temp_storage_bytes, std::forward(args)...); + } + } + }; + + fast_sort_impl(ascending, + input.begin(), + output_view.begin(), + indices.begin(), + indices.begin(), + input.size(), + segment_offsets.size() - 1, + segment_offsets.begin(), + segment_offsets.begin() + 1, + stream.value()); + } + + template ())> + void operator()(column_view const& input, + column_view const& segment_offsets, + mutable_column_view& indices, + bool ascending, + rmm::cuda_stream_view stream) + { + fast_sort(input, segment_offsets, indices, ascending, stream); + } + + template ())> + void operator()( + column_view const&, column_view const&, mutable_column_view&, bool, rmm::cuda_stream_view) + { + CUDF_FAIL("Column type cannot be used with fast-sort function"); + } +}; + +/** + * @brief Performs faster sort on eligible columns + * + * Check the `is_fast_sort_supported()==true` on the input column before using this function. + * + * @param input Column to sort + * @param segment_offsets Identifies segments to sort within + * @param column_order Sort ascending or descending + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +template +std::unique_ptr fast_segmented_sorted_order(column_view const& input, + column_view const& segment_offsets, + order const& column_order, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Unfortunately, CUB's segmented sort functions cannot accept iterators. + // We have to build a pre-filled sequence of indices as input. + auto sorted_indices = + cudf::detail::sequence(input.size(), numeric_scalar{0}, stream, mr); + auto indices_view = sorted_indices->mutable_view(); + + cudf::type_dispatcher(input.type(), + column_fast_sort_fn{}, + input, + segment_offsets, + indices_view, + column_order == order::ASCENDING, + stream); + return sorted_indices; +} + +/** + * @brief Builds indices to identify segments to sort + * + * The segments are added to the input table-view keys so they + * are lexicographically sorted within the segmented groups. + * + * ``` + * Example 1: + * num_rows = 10 + * offsets = {0, 3, 7, 10} + * segment-indices -> { 3,3,3, 7,7,7,7, 10,10,10 } + * ``` + * + * ``` + * Example 2: (offsets do not cover all indices) + * num_rows = 10 + * offsets = {3, 7} + * segment-indices -> { 0,1,2, 7,7,7,7, 8,9,10 } + * ``` + * + * @param num_rows Total number of rows in the input keys to sort + * @param offsets The offsets identifying the segments + * @param stream CUDA stream used for device memory operations and kernel launches + */ +rmm::device_uvector get_segment_indices(size_type num_rows, + column_view const& offsets, + rmm::cuda_stream_view stream); + +/** + * @brief Segmented sorted-order utility + * + * Returns the indices that map the column to a segmented sorted table. + * Automatically handles calling accelerated code paths as appropriate. + * + * @tparam method Specifies sort is stable or not + * @param keys Table to sort + * @param segment_offsets Identifies the segments within the keys + * @param column_order Sort order for each column in the keys + * @param null_precedence Where to place the null entries for each column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to allocate any returned objects + */ +template +std::unique_ptr segmented_sorted_order_common( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (keys.num_rows() == 0 || keys.num_columns() == 0) { + return cudf::make_empty_column(type_to_id()); + } + + CUDF_EXPECTS(segment_offsets.type() == data_type(type_to_id()), + "segment offsets should be size_type"); + + if (not column_order.empty()) { + CUDF_EXPECTS(static_cast(keys.num_columns()) == column_order.size(), + "Mismatch between number of columns and column order."); + } + + if (not null_precedence.empty()) { + CUDF_EXPECTS(static_cast(keys.num_columns()) == null_precedence.size(), + "Mismatch between number of columns and null_precedence size."); + } + + // the average row size for which to prefer fast sort + constexpr cudf::size_type MAX_AVG_LIST_SIZE_FOR_FAST_SORT{100}; + // the maximum row count for which to prefer fast sort + constexpr cudf::size_type MAX_LIST_SIZE_FOR_FAST_SORT{1 << 18}; + + // fast-path for single column sort: + // - single-column table + // - not stable-sort + // - no nulls and allowable fixed-width type + // - size and width are limited -- based on benchmark results + if (keys.num_columns() == 1 and + column_fast_sort_fn::is_fast_sort_supported(keys.column(0)) and + (segment_offsets.size() > 0) and + (((keys.num_rows() / segment_offsets.size()) < MAX_AVG_LIST_SIZE_FOR_FAST_SORT) or + (keys.num_rows() < MAX_LIST_SIZE_FOR_FAST_SORT))) { + auto const col_order = column_order.empty() ? order::ASCENDING : column_order.front(); + return fast_segmented_sorted_order( + keys.column(0), segment_offsets, col_order, stream, mr); + } + + // Get segment id of each element in all segments. + auto segment_ids = get_segment_indices(keys.num_rows(), segment_offsets, stream); + + // insert segment id before all columns. + std::vector keys_with_segid; + keys_with_segid.reserve(keys.num_columns() + 1); + keys_with_segid.push_back( + column_view(data_type(type_to_id()), segment_ids.size(), segment_ids.data())); + keys_with_segid.insert(keys_with_segid.end(), keys.begin(), keys.end()); + auto segid_keys = table_view(keys_with_segid); + + auto prepend_default = [](auto const& vector, auto default_value) { + if (vector.empty()) return vector; + std::remove_cv_t> pre_vector; + pre_vector.reserve(pre_vector.size() + 1); + pre_vector.push_back(default_value); + pre_vector.insert(pre_vector.end(), vector.begin(), vector.end()); + return pre_vector; + }; + auto child_column_order = prepend_default(column_order, order::ASCENDING); + auto child_null_precedence = prepend_default(null_precedence, null_order::AFTER); + + // return sorted order of child columns + if constexpr (method == sort_method::STABLE) { + return detail::stable_sorted_order( + segid_keys, child_column_order, child_null_precedence, stream, mr); + } else { + return detail::sorted_order(segid_keys, child_column_order, child_null_precedence, stream, mr); + } +} + +template +std::unique_ptr
segmented_sort_by_key_common(table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(values.num_rows() == keys.num_rows(), + "Mismatch in number of rows for values and keys"); + auto sorted_order = segmented_sorted_order_common(keys, + segment_offsets, + column_order, + null_precedence, + stream, + rmm::mr::get_current_device_resource()); + // Gather segmented sort of child value columns + return detail::gather(values, + sorted_order->view(), + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu new file mode 100644 index 00000000000..40df1b50279 --- /dev/null +++ b/cpp/src/sort/stable_segmented_sort.cu @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "segmented_sort_impl.cuh" + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +std::unique_ptr stable_segmented_sorted_order( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_sorted_order_common( + keys, segment_offsets, column_order, null_precedence, stream, mr); +} + +std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_sort_by_key_common( + values, keys, segment_offsets, column_order, null_precedence, stream, mr); +} + +} // namespace detail + +std::unique_ptr stable_segmented_sorted_order( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_segmented_sorted_order( + keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); +} + +std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_segmented_sort_by_key( + values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); +} + +} // namespace cudf diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index 4d25b14588d..3156761243f 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -28,7 +28,6 @@ template using column_wrapper = cudf::test::fixed_width_column_wrapper; -using LCWstr = cudf::test::lists_column_wrapper; template struct SegmentedSort : public cudf::test::BaseFixture { @@ -80,10 +79,10 @@ TYPED_TEST(SegmentedSort, NoNull) { using T = TypeParam; - // segments {0 1 2} {3 4} {5} {6 7 8 9 10}{11 12}{13}{14 15} + // segments {0 1 2} {3 4} {5} {6 7 8 9 10}{11 12}{13}{14 15} column_wrapper col1{{10, 36, 14, 32, 49, 23, 10, 34, 12, 45, 12, 37, 43, 26, 21, 16}}; column_wrapper col2{{10, 63, 41, 23, 94, 32, 10, 43, 21, 54, 22, 73, 34, 62, 12, 61}}; - // segment sorted order {0 2 1} {3 4} {5} {6 8 10 7 9}{11 12}{13}{15 16} + // segment sorted order {0 2 1} {3 4} {5} {6 8 10 7 9}{11 12}{13}{15 16} column_wrapper segments{0, 3, 5, 5, 5, 6, 11, 13, 14, 16}; cudf::table_view input1{{col1}}; cudf::table_view input2{{col1, col2}}; @@ -117,7 +116,7 @@ TYPED_TEST(SegmentedSort, Null) using T = TypeParam; if (std::is_same_v) return; - // segments {0 1 2} {3 4} {5} {6 7 8 9 10}{11 12}{13}{14 15} + // segments {0 1 2}{3 4} {5}{6 7 8 9 10}{11 12}{13}{14 15} column_wrapper col1{{1, 3, 2, 4, 5, 23, 6, 8, 7, 9, 7, 37, 43, 26, 21, 16}, {1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}}; column_wrapper col2{{0, 0, 0, 1, 1, 4, 5, 5, 21, 5, 22, 6, 6, 7, 8, 8}, @@ -165,6 +164,57 @@ TYPED_TEST(SegmentedSort, Null) CUDF_TEST_EXPECT_TABLES_EQUAL(results->view(), expected12_ab); } +TYPED_TEST(SegmentedSort, StableNoNulls) +{ + using T = TypeParam; + + // segments {0 1 2} {3 4} {5} {6 7 8 9 10}{11 12}{13}{14 15} + column_wrapper col1{{10, 36, 14, 32, 49, 23, 10, 34, 12, 45, 11, 37, 43, 26, 21, 16}}; + column_wrapper col2{{10, 63, 10, 23, 94, 32, 10, 43, 22, 43, 22, 34, 34, 62, 62, 61}}; + // stable sorted order {0 2 1} {3 4} {5} {6 8 10 7 9}{11 12}{13}{16 15} + column_wrapper segments{0, 3, 5, 5, 5, 6, 11, 13, 14, 16}; + auto values = cudf::table_view{{col1}}; + auto keys = cudf::table_view{{col2}}; + + // Ascending + column_wrapper col_asc{{10, 14, 36, 32, 49, 23, 10, 12, 11, 34, 45, 37, 43, 26, 16, 21}}; + auto results = + cudf::stable_segmented_sort_by_key(values, keys, segments, {cudf::order::ASCENDING}); + CUDF_TEST_EXPECT_TABLES_EQUAL(results->view(), cudf::table_view{{col_asc}}); + // Descending + column_wrapper col_des{{36, 10, 14, 49, 32, 23, 34, 45, 12, 11, 10, 37, 43, 26, 21, 16}}; + results = cudf::stable_segmented_sort_by_key(values, keys, segments, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_TABLES_EQUAL(results->view(), cudf::table_view{{col_des}}); +} + +TYPED_TEST(SegmentedSort, StableWithNulls) +{ + using T = TypeParam; + + // segments {0 1 2} {3 4} {5} {6 7 8 9 10}{11 12}{13}{14 15} + column_wrapper col1{{10, 36, 0, 32, 49, 23, 10, 0, 12, 45, 11, 37, 43, 0, 21, 16}, + {1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1}}; + column_wrapper col2{{10, 0, 10, 23, 94, 32, 0, 43, 0, 43, 0, 34, 34, 62, 62, 61}, + {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; + // stable sorted order {0 2 1} {3 4} {5} {6 8 10 7 9}{11 12}{13}{16 15} + column_wrapper segments{0, 3, 5, 5, 5, 6, 11, 13, 14, 16}; + auto values = cudf::table_view{{col1}}; + auto keys = cudf::table_view{{col2}}; + + // Ascending + column_wrapper col_asc{{36, 10, 0, 32, 49, 23, 10, 12, 11, 0, 45, 37, 43, 0, 16, 21}, + {1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1}}; + auto results = + cudf::stable_segmented_sort_by_key(values, keys, segments, {cudf::order::ASCENDING}); + CUDF_TEST_EXPECT_TABLES_EQUAL(results->view(), cudf::table_view{{col_asc}}); + + // Descending + column_wrapper col_des{{10, 0, 36, 49, 32, 23, 0, 45, 12, 11, 10, 37, 43, 0, 21, 16}, + {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1}}; + results = cudf::stable_segmented_sort_by_key(values, keys, segments, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_TABLES_EQUAL(results->view(), cudf::table_view{{col_des}}); +} + TEST_F(SegmentedSortInt, NonZeroSegmentsStart) { using T = int; @@ -184,14 +234,28 @@ TEST_F(SegmentedSortInt, NonZeroSegmentsStart) cudf::table_view input{{col1}}; auto results = cudf::segmented_sorted_order(input, segments1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected1); + results = cudf::stable_segmented_sorted_order(input, segments1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected1); + results = cudf::segmented_sorted_order(input, segments2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); + results = cudf::stable_segmented_sorted_order(input, segments2); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); + results = cudf::segmented_sorted_order(input, segments3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected3); + results = cudf::stable_segmented_sorted_order(input, segments3); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected3); + results = cudf::segmented_sorted_order(input, segments4); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected4); + results = cudf::stable_segmented_sorted_order(input, segments4); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected4); + results = cudf::segmented_sorted_order(input, segments5); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected5); + results = cudf::stable_segmented_sorted_order(input, segments5); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected5); } TEST_F(SegmentedSortInt, Sliced) @@ -214,12 +278,18 @@ TEST_F(SegmentedSortInt, Sliced) // sliced input auto results = cudf::segmented_sorted_order(input, segments1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected1); + results = cudf::stable_segmented_sorted_order(input, segments1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected1); // sliced input and sliced segment results = cudf::segmented_sorted_order(input, seg_slice); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); + results = cudf::stable_segmented_sorted_order(input, seg_slice); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); // sliced input, segment end. results = cudf::segmented_sorted_order(input, segments3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected3); + results = cudf::stable_segmented_sorted_order(input, segments3); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected3); } TEST_F(SegmentedSortInt, ErrorsMismatchArgSizes) @@ -228,24 +298,24 @@ TEST_F(SegmentedSortInt, ErrorsMismatchArgSizes) column_wrapper col1{{5, 6, 7, 8, 9}}; column_wrapper segments{{1, 2, 3, 4}}; cudf::table_view input1{{col1}}; + std::vector order{cudf::order::ASCENDING, cudf::order::ASCENDING}; + std::vector null_order{cudf::null_order::AFTER, cudf::null_order::AFTER}; // Mismatch order sizes - EXPECT_THROW(cudf::segmented_sort_by_key( - input1, input1, segments, {cudf::order::ASCENDING, cudf::order::ASCENDING}, {}), - cudf::logic_error); + EXPECT_THROW(cudf::segmented_sort_by_key(input1, input1, segments, order, {}), cudf::logic_error); + EXPECT_THROW(cudf::stable_segmented_sorted_order(input1, segments, order, {}), cudf::logic_error); // Mismatch null precedence sizes - EXPECT_THROW(cudf::segmented_sort_by_key( - input1, input1, segments, {}, {cudf::null_order::AFTER, cudf::null_order::AFTER}), + EXPECT_THROW(cudf::segmented_sorted_order(input1, segments, {}, null_order), cudf::logic_error); + EXPECT_THROW(cudf::stable_segmented_sort_by_key(input1, input1, segments, {}, null_order), cudf::logic_error); // Both - EXPECT_THROW(cudf::segmented_sort_by_key(input1, - input1, - segments, - {cudf::order::ASCENDING, cudf::order::ASCENDING}, - {cudf::null_order::AFTER, cudf::null_order::AFTER}), + EXPECT_THROW(cudf::segmented_sort_by_key(input1, input1, segments, order, null_order), + cudf::logic_error); + EXPECT_THROW(cudf::stable_segmented_sort_by_key(input1, input1, segments, order, null_order), cudf::logic_error); // segmented_offsets beyond num_rows - undefined behavior, no throw. CUDF_EXPECT_NO_THROW(cudf::segmented_sort_by_key(input1, input1, segments)); + CUDF_EXPECT_NO_THROW(cudf::stable_segmented_sort_by_key(input1, input1, segments)); } TEST_F(SegmentedSortInt, Bool) @@ -257,12 +327,13 @@ TEST_F(SegmentedSortInt, Bool) cudf::test::fixed_width_column_wrapper segments{{0, 5, 10, 15, 20, 25, 30, 40}}; - auto test_col = cudf::column_view{col1}; - auto result = cudf::segmented_sorted_order(cudf::table_view({test_col}), segments); - cudf::test::fixed_width_column_wrapper expected( {1, 2, 0, 3, 4, 5, 6, 7, 8, 9, 13, 14, 10, 11, 12, 15, 16, 18, 19, 17, 20, 21, 22, 23, 24, 27, 29, 25, 26, 28, 36, 38, 39, 30, 31, 32, 33, 34, 35, 37}); + auto test_col = cudf::column_view{col1}; + auto result = cudf::segmented_sorted_order(cudf::table_view({test_col}), segments); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::stable_segmented_sorted_order(cudf::table_view({test_col}), segments); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); }