diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d286f43e0eb..cc336c524fa 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -169,13 +169,11 @@ outputs: - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/contains.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/extract.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/detail/stream_compaction.hpp - - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/filling.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0903609c1e2..250d7849cda 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -379,7 +379,6 @@ add_library( src/lists/copying/segmented_gather.cu src/lists/copying/scatter_helper.cu src/lists/count_elements.cu - src/lists/drop_list_duplicates.cu src/lists/explode.cu src/lists/extract.cu src/lists/interleave_columns.cu diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp deleted file mode 100644 index 8cde8c1708c..00000000000 --- a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace cudf { -namespace lists { -namespace detail { -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * lists_column_view const&, - * duplicate_keep_option, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - duplicate_keep_option keep_option, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr drop_list_duplicates( - lists_column_view const& input, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace detail -} // namespace lists -} // namespace cudf diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp deleted file mode 100644 index 123ec69a7aa..00000000000 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -#include - -namespace cudf { -namespace lists { -/** - * @addtogroup lists_drop_duplicates - * @{ - * @file - */ - -/** - * @brief Copy the elements from the lists in `keys` and associated `values` columns according to - * the unique elements in `keys`. - * - * For each list in `keys` and associated `values`, according to the parameter `keep_option`, copy - * the unique elements from the list in `keys` and their corresponding elements in `values` to new - * lists. Order of the output elements within each list are not guaranteed to be preserved as in the - * input. - * - * Behavior is undefined if `count_elements(keys)[i] != count_elements(values)[i]` for all `i` in - * `[0, keys.size())`. - * - * @throw cudf::logic_error If the child column of the input keys column contains nested type other - * than STRUCT. - * @throw cudf::logic_error If `keys.size() != values.size()`. - * - * @param keys The input keys lists column to check for uniqueness and copy unique elements. - * @param values The values lists column in which the elements are mapped to elements in the key - * column. - * @param nulls_equal Flag to specify whether null key elements should be considered as equal. - * @param nans_equal Flag to specify whether NaN key elements should be considered as equal - * (only applicable for floating point keys elements). - * @param keep_option Flag to specify which elements will be copied from the input to the output. - * @param mr Device resource used to allocate memory. - * - * @code{.pseudo} - * keys = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } - * values = { {"a", "b", "c", "d"}, {"e"}, NULL, {}, {"N0", "N1", "N2", "f", "g", "h", "i", "j"} } - * - * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_FIRST) - * out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } - * out_values = { {"a", "c", "d"}, {"e"}, NULL, {}, {"f", "g", "N0"} } - * - * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_LAST) - * out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } - * out_values = { {"b", "c", "d"}, {"e"}, NULL, {}, {"j", "i", "N2"} } - * - * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_NONE) - * out_keys = { {2, 3}, {4}, NULL, {}, {} } - * out_values = { {"c", "d"}, {"e"}, NULL, {}, {} } - * @endcode - * - * @return A pair of lists columns storing the results from extracting unique key elements and their - * corresponding values elements from the input. - */ -std::pair, std::unique_ptr> drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - duplicate_keep_option keep_option = duplicate_keep_option::KEEP_FIRST, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::UNEQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Create a new list column by copying elements from the input lists column ignoring - * duplicate list elements. - * - * Given a lists column, an output lists column is generated by copying elements from the input - * lists column in a way such that the duplicate elements in each list are ignored, producing only - * unique list elements. - * - * Order of the output elements are not guaranteed to be preserved as in the input. - * - * @throw cudf::logic_error If the child column of the input lists column contains nested type other - * than STRUCT. - * - * @param input The input lists column to check and copy unique elements. - * @param nulls_equal Flag to specify whether null key elements should be considered as equal. - * @param nans_equal Flag to specify whether NaN key elements should be considered as equal - * (only applicable for floating point keys column). - * @param mr Device resource used to allocate memory. - * - * @code{.pseudo} - * input = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } - * drop_list_duplicates(input) = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } - * @endcode - * - * @return A lists column storing the results from extracting unique list elements from the input. - */ -std::unique_ptr drop_list_duplicates( - lists_column_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::UNEQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** @} */ // end of group -} // namespace lists -} // namespace cudf diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu deleted file mode 100644 index d0700f439ce..00000000000 --- a/cpp/src/lists/drop_list_duplicates.cu +++ /dev/null @@ -1,638 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -namespace cudf::lists { -namespace detail { - -namespace { -template -struct has_negative_nans_fn { - column_device_view const d_view; - - has_negative_nans_fn(column_device_view const& d_view) : d_view(d_view) {} - - __device__ Type operator()(size_type idx) const noexcept - { - if (d_view.is_null(idx)) { return false; } - - auto const val = d_view.element(idx); - return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative - } -}; - -/** - * @brief A structure to be used along with type_dispatcher to check if a column has any - * negative NaN value. - * - * This functor is necessary because when calling to segmented sort on the list entries, the - * negative NaN and positive NaN values (if both exist) are separated to the two ends of the output - * lists. We want to move all NaN values close together in order to call unique_copy later on. - */ -struct has_negative_nans_dispatch { - template >* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) const noexcept - { - auto const d_entries_ptr = column_device_view::create(input, stream); - return thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - has_negative_nans_fn{*d_entries_ptr}); - } - - template >* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) const - { - // Recursively check negative NaN on the children columns. - return std::any_of(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_children()), - [structs_view = structs_column_view{input}, stream](auto const child_idx) { - auto const col = structs_view.get_sliced_child(child_idx); - return type_dispatcher( - col.type(), has_negative_nans_dispatch{}, col, stream); - }); - } - - template && - !std::is_same_v>* = nullptr> - bool operator()(column_view const&, rmm::cuda_stream_view) const - { - // Non-nested columns of non floating-point data do not contain NaN. - // Nested columns (not STRUCT) are not supported and should not reach this point. - return false; - } -}; - -/** - * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for a - * floating-point data column. - * - * Replacing -NaN by NaN is necessary before calling to segmented sort for lists because the sorting - * API may separate -NaN and NaN to the two ends of each result list while we want to group all NaN - * together. - */ -struct replace_negative_nans_dispatch { - template >* = nullptr> - std::unique_ptr operator()(column_view const& input, - rmm::cuda_stream_view stream) const noexcept - { - return cuda::std::is_floating_point_v - ? cudf::detail::normalize_nans_and_zeros(input, stream) - : std::make_unique(input, stream); - } - - template >* = nullptr> - std::unique_ptr operator()(column_view const& input, - rmm::cuda_stream_view stream) const noexcept - { - std::vector> output_struct_members; - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_children()), - std::back_inserter(output_struct_members), - [structs_view = structs_column_view{input}, stream](auto const child_idx) { - auto const col = structs_view.get_sliced_child(child_idx); - return type_dispatcher( - col.type(), replace_negative_nans_dispatch{}, col, stream); - }); - - return cudf::make_structs_column(input.size(), - std::move(output_struct_members), - input.null_count(), - cudf::detail::copy_bitmask(input, stream), - stream); - } -}; - -/** - * @brief Perform an equality comparison between two entries in a lists column, specialized from - * `cudf::element_equality_comparator` to take into account both parameters `nulls_equal` and - * `nans_equal` when comparing floating-point numbers. - * - * For the two entries that are NOT in the same list, they will always be considered as different. - * - * If they are from the same list and their type is not floating point, this functor will return the - * same comparison result as `cudf::element_equality_comparator`. - * - * For floating-point types, entries holding NaN value can be considered as different or the same - * value depending on the `nans_equal` parameter. - */ -template -struct column_row_comparator_fn { - size_type const* const list_indices; - column_device_view const lhs; - column_device_view const rhs; - null_equality const nulls_equal; - bool const has_nulls; - bool const nans_equal; - - __host__ __device__ column_row_comparator_fn(size_type const* const list_indices, - column_device_view const& lhs, - column_device_view const& rhs, - null_equality const nulls_equal, - bool const has_nulls, - bool const nans_equal) - : list_indices(list_indices), - lhs(lhs), - rhs(rhs), - nulls_equal(nulls_equal), - has_nulls(has_nulls), - nans_equal(nans_equal) - { - } - - template >* = nullptr> - bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept - { - return lhs_val == rhs_val; - } - - template >* = nullptr> - bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept - { - // If both element(i) and element(j) are NaNs and NaNs are considered as equal value then this - // comparison will return `true`. This is the desired behavior in Pandas. - if (nans_equal && std::isnan(lhs_val) && std::isnan(rhs_val)) { return true; } - - // If NaNs are considered as NOT equal, even both element(i) and element(j) are NaNs this - // comparison will still return `false`. This is the desired behavior in Apache Spark. - return lhs_val == rhs_val; - } - - bool __device__ operator()(size_type i, size_type j) const noexcept - { - // Two entries are not considered for equality if they belong to different lists. - if (list_indices[i] != list_indices[j]) { return false; } - - if (has_nulls) { - bool const lhs_is_null{lhs.nullable() && lhs.is_null_nocheck(i)}; - bool const rhs_is_null{rhs.nullable() && rhs.is_null_nocheck(j)}; - if (lhs_is_null && rhs_is_null) { - return nulls_equal == null_equality::EQUAL; - } else if (lhs_is_null != rhs_is_null) { - return false; - } - } - - return compare(lhs.element(i), lhs.element(j)); - } -}; - -/** - * @brief Struct used in type_dispatcher for comparing two entries in a lists column. - */ -struct column_row_comparator_dispatch { - size_type const* const list_indices; - column_device_view const lhs; - column_device_view const rhs; - null_equality const nulls_equal; - bool const has_nulls; - bool const nans_equal; - - __device__ column_row_comparator_dispatch(size_type const* const list_indices, - column_device_view const& lhs, - column_device_view const& rhs, - null_equality const nulls_equal, - bool const has_nulls, - bool const nans_equal) - : list_indices(list_indices), - lhs(lhs), - rhs(rhs), - nulls_equal(nulls_equal), - has_nulls(has_nulls), - nans_equal(nans_equal) - { - } - - template ()>* = nullptr> - bool __device__ operator()(size_type i, size_type j) const noexcept - { - return column_row_comparator_fn{ - list_indices, lhs, rhs, nulls_equal, has_nulls, nans_equal}(i, j); - } - - template ()>* = nullptr> - bool operator()(size_type, size_type) const - { - CUDF_FAIL( - "column_row_comparator_dispatch cannot operate on types that are not equally comparable."); - } -}; - -/** - * @brief Performs an equality comparison between rows of two tables using - * `column_row_comparator_fn` functor to compare rows of their corresponding columns. - */ -struct table_row_comparator_fn { - size_type const* const list_indices; - table_device_view const lhs; - table_device_view const rhs; - null_equality const nulls_equal; - bool const has_nulls; - bool const nans_equal; - - table_row_comparator_fn(size_type const* const list_indices, - table_device_view const& lhs, - table_device_view const& rhs, - null_equality const nulls_equal, - bool const has_nulls, - bool const nans_equal) - : list_indices(list_indices), - lhs(lhs), - rhs(rhs), - nulls_equal(nulls_equal), - has_nulls(has_nulls), - nans_equal(nans_equal) - { - } - - bool __device__ operator()(size_type i, size_type j) const - { - auto column_comp = [=](column_device_view const& lhs, column_device_view const& rhs) { - return type_dispatcher( - lhs.type(), - column_row_comparator_dispatch{list_indices, lhs, rhs, nulls_equal, has_nulls, nans_equal}, - i, - j); - }; - - return thrust::equal(thrust::seq, lhs.begin(), lhs.end(), rhs.begin(), column_comp); - } -}; - -/** - * @brief Struct used in type_dispatcher for copying indices of the list entries ignoring duplicate - * list entries. - */ -struct get_indices_of_unique_entries_dispatch { - template () && - !std::is_same_v>* = nullptr> - size_type* operator()(size_type const*, - column_view const&, - size_type, - size_type*, - null_equality, - nan_equality, - duplicate_keep_option, - rmm::cuda_stream_view) const - { - CUDF_FAIL( - "get_indices_of_unique_entries_dispatch cannot operate on types that are not equally " - "comparable or not STRUCT type."); - } - - template ()>* = nullptr> - size_type* operator()(size_type const* list_indices, - column_view const& all_lists_entries, - size_type num_entries, - size_type* output_begin, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream) const noexcept - { - auto const d_view = column_device_view::create(all_lists_entries, stream); - auto const comp = column_row_comparator_fn{list_indices, - *d_view, - *d_view, - nulls_equal, - all_lists_entries.has_nulls(), - nans_equal == nan_equality::ALL_EQUAL}; - return cudf::detail::unique_copy(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp, - keep_option, - stream); - } - - template >* = nullptr> - size_type* operator()(size_type const* list_indices, - column_view const& all_lists_entries, - size_type num_entries, - size_type* output_begin, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream) const noexcept - { - auto const flattened_entries = cudf::structs::detail::flatten_nested_columns( - table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {}); - auto const dview_ptr = table_device_view::create(flattened_entries, stream); - // Search through children of all levels for nulls. - auto const nested_has_nulls = has_nulls(flattened_entries.flattened_columns()); - - auto const comp = table_row_comparator_fn{list_indices, - *dview_ptr, - *dview_ptr, - nulls_equal, - nested_has_nulls, - nans_equal == nan_equality::ALL_EQUAL}; - return cudf::detail::unique_copy(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp, - keep_option, - stream); - } -}; - -/** - * @brief Extract list entries and their corresponding (1-based) list indices ignoring duplicate - * entries. - */ -std::vector> get_unique_entries_and_list_indices( - column_view const& keys_entries, - std::optional const& values_entries, - device_span entries_list_indices, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_entries = keys_entries.size(); - - // Allocate memory to store the indices of the unique key entries. - // These indices will be used as a gather map to collect keys and values. - auto unique_indices = rmm::device_uvector(num_entries, stream); - auto const output_begin = unique_indices.begin(); - auto const output_end = type_dispatcher(keys_entries.type(), - get_indices_of_unique_entries_dispatch{}, - entries_list_indices.begin(), - keys_entries, - num_entries, - output_begin, - nulls_equal, - nans_equal, - keep_option, - stream); - - auto const list_indices_view = column_view(data_type{type_to_id()}, - static_cast(entries_list_indices.size()), - entries_list_indices.data()); - auto const input_table = values_entries - ? table_view{{keys_entries, values_entries.value(), list_indices_view}} - : table_view{{keys_entries, list_indices_view}}; - - // Collect unique entries and entry list indices. - // The new null_count and bitmask of the unique entries will also be generated by the gather - // function. - return cudf::detail::gather(input_table, - device_span( - unique_indices.data(), thrust::distance(output_begin, output_end)), - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr) - ->release(); -} - -/** - * @brief Common execution code called by all public `drop_list_duplicates` APIs. - */ -std::pair, std::unique_ptr> drop_list_duplicates_common( - lists_column_view const& keys, - std::optional const& values, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (auto const child_type = keys.child().type(); - cudf::is_nested(child_type) && child_type.id() != type_id::STRUCT) { - CUDF_FAIL( - "Keys of nested types other than STRUCT are not supported in `drop_list_duplicates`."); - } - - CUDF_EXPECTS(!values || keys.size() == values.value().size(), - "Keys and values columns must have the same size."); - - if (keys.is_empty()) { - return std::pair{cudf::empty_like(keys.parent()), - values ? cudf::empty_like(values.value().parent()) : nullptr}; - } - - // The child column containing list entries. - auto const keys_child = keys.get_sliced_child(stream); - - // Generate a mapping from list entries to their list indices for the keys column. - auto const entries_list_indices = [&] { - auto labels = rmm::device_uvector(keys_child.size(), stream); - cudf::detail::label_segments( - keys.offsets_begin(), keys.offsets_end(), labels.begin(), labels.end(), stream); - return labels; - }(); - - // Generate segmented sorted order for key entries. - // The keys column will be sorted (gathered) using this order. - auto const sorted_order = [&]() { - auto const list_indices_view = column_view(data_type{type_to_id()}, - static_cast(entries_list_indices.size()), - entries_list_indices.data()); - - // If nans_equal == ALL_EQUAL and the keys column contains floating-point data type, - // we need to replace `-NaN` by `NaN` before sorting. - auto const replace_negative_nan = - nans_equal == nan_equality::ALL_EQUAL && - type_dispatcher(keys_child.type(), has_negative_nans_dispatch{}, keys_child, stream); - - if (replace_negative_nan) { - auto const replaced_nan_keys_child = - type_dispatcher(keys_child.type(), replace_negative_nans_dispatch{}, keys_child, stream); - return cudf::detail::stable_sorted_order( - table_view{{list_indices_view, replaced_nan_keys_child->view()}}, - {order::ASCENDING, order::ASCENDING}, - {null_order::AFTER, null_order::AFTER}, - stream); - } else { - return cudf::detail::stable_sorted_order(table_view{{list_indices_view, keys_child}}, - {order::ASCENDING, order::ASCENDING}, - {null_order::AFTER, null_order::AFTER}, - stream); - } - }(); - - auto const sorting_table = values - ? table_view{{keys_child, values.value().get_sliced_child(stream)}} - : table_view{{keys_child}}; - auto const sorted_table = cudf::detail::gather(sorting_table, - sorted_order->view(), - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream); - - // Extract the segmented sorted key entries. - auto const sorted_keys_entries = sorted_table->get_column(0).view(); - auto const sorted_values_entries = - values ? std::optional(sorted_table->get_column(1).view()) : std::nullopt; - - // Generate child columns containing unique entries (along with their list indices). - // null_count and bitmask of these columns will also be generated in this function. - auto unique_entries_and_list_indices = get_unique_entries_and_list_indices(sorted_keys_entries, - sorted_values_entries, - entries_list_indices, - nulls_equal, - nans_equal, - keep_option, - stream, - mr); - - // Generate offsets for the output lists column(s). - auto output_offsets = [&] { - auto out_offsets = make_numeric_column( - data_type{type_to_id()}, keys.size() + 1, mask_state::UNALLOCATED, stream, mr); - auto const offsets = out_offsets->mutable_view(); - auto const labels = - unique_entries_and_list_indices.back()->view(); // unique entries' list indices - cudf::detail::labels_to_offsets(labels.template begin(), - labels.template end(), - offsets.template begin(), - offsets.template end(), - stream); - return out_offsets; - }(); - - // If the values lists column is not given, its corresponding output will be nullptr. - auto out_values = - values ? make_lists_column(keys.size(), - std::make_unique(output_offsets->view(), stream, mr), - std::move(unique_entries_and_list_indices[1]), - values.value().null_count(), - cudf::detail::copy_bitmask(values.value().parent(), stream, mr), - stream, - mr) - : nullptr; - - auto out_keys = make_lists_column(keys.size(), - std::move(output_offsets), - std::move(unique_entries_and_list_indices[0]), - keys.null_count(), - cudf::detail::copy_bitmask(keys.parent(), stream, mr), - stream, - mr); - - return std::pair{std::move(out_keys), std::move(out_values)}; -} - -} // anonymous namespace - -std::pair, std::unique_ptr> drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return drop_list_duplicates_common(keys, - std::optional(values), - nulls_equal, - nans_equal, - keep_option, - stream, - mr); -} - -std::unique_ptr drop_list_duplicates(lists_column_view const& input, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return drop_list_duplicates_common(input, - std::nullopt, - nulls_equal, - nans_equal, - duplicate_keep_option::KEEP_FIRST, - stream, - mr) - .first; -} - -} // namespace detail - -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * lists_column_view const&, - * duplicate_keep_option, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - */ -std::pair, std::unique_ptr> drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - duplicate_keep_option keep_option, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates( - keys, values, nulls_equal, nans_equal, keep_option, cudf::default_stream_value, mr); -} - -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - */ -std::unique_ptr drop_list_duplicates(lists_column_view const& input, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates( - input, nulls_equal, nans_equal, cudf::default_stream_value, mr); -} - -} // namespace cudf::lists diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a1e3cfed286..d00fa6633de 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -477,7 +477,6 @@ ConfigureTest( lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp lists/count_elements_tests.cpp - lists/drop_list_duplicates_tests.cpp lists/explode_tests.cpp lists/extract_tests.cpp lists/sequences_tests.cpp diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp deleted file mode 100644 index 54d7ba0a95e..00000000000 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ /dev/null @@ -1,921 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include - -#include -#include - -using namespace cudf::test::iterators; - -using float_type = float; -using IntListsCol = cudf::test::lists_column_wrapper; -using FloatListsCol = cudf::test::lists_column_wrapper; -using StrListsCol = cudf::test::lists_column_wrapper; -using StringsCol = cudf::test::strings_column_wrapper; -using StructsCol = cudf::test::structs_column_wrapper; -using IntsCol = cudf::test::fixed_width_column_wrapper; -using FloatsCol = cudf::test::fixed_width_column_wrapper; - -auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); -auto constexpr neg_Inf = -std::numeric_limits::infinity(); -auto constexpr NaN = std::numeric_limits::quiet_NaN(); -auto constexpr Inf = std::numeric_limits::infinity(); -auto constexpr verbosity = cudf::test::debug_output_level::FIRST_ERROR; - -struct DropListDuplicatesTest : public cudf::test::BaseFixture { -}; - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) -{ - // -0.0 and 0.0 should be considered equal. - auto const keys = FloatListsCol{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; - auto const vals = - StrListsCol{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"}; - auto const expected_keys = FloatListsCol{0, 1, 2, 3}; - - // Remove duplicates only from keys. - { - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_keys, verbosity); - } - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = StrListsCol{"1", "2", "3", "14"}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = StrListsCol{"13", "8", "9", "14"}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = FloatListsCol{3}; - auto const expected_vals = StrListsCol{"14"}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } -} - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) -{ - auto const keys = FloatListsCol{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; - auto const vals = IntListsCol{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - auto const expected_keys = FloatListsCol{neg_Inf, 0, Inf}; - - // Remove duplicates only from keys. - { - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_keys, verbosity); - } - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = IntListsCol{3, 2, 1}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = IntListsCol{11, 10, 9}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = FloatListsCol{FloatListsCol{}}; - auto const expected_vals = IntListsCol{IntListsCol{}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - // exit(0); -} - -// The position of NaN is undefined after sorting, thus we need to offload the data to CPU to -// check for validity. -// We will not store NaN in the results_expected variable (an unordered_set) because we can't check -// for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare -// with the number of NaNs in the output. -static void test_floating_point(std::vector const& h_input, - std::unordered_set const& results_expected, - cudf::nan_equality nans_equal) -{ - // If NaNs are considered as equal value, the final result should always contain at max ONE NaN - // entry per list. - std::size_t const num_NaNs = - nans_equal == cudf::nan_equality::ALL_EQUAL - ? std::size_t{1} - : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); - - auto const results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{FloatListsCol(h_input.begin(), h_input.end())}, - cudf::null_equality::EQUAL, - nans_equal); - auto const results_arr = - cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; - - EXPECT_EQ(results_arr.size(), results_expected.size() + num_NaNs); - - std::size_t NaN_count{0}; - std::unordered_set results; - for (auto const x : results_arr) { - if (std::isnan(x)) { - ++NaN_count; - } else { - results.insert(x); - } - } - EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == num_NaNs); -} - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) -{ - std::vector h_input{ - 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; - std::unordered_set results_expected{-2, -1, 0, 1, 2}; - test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); -} - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) -{ - std::vector h_input{neg_Inf, 0, neg_NaN, 1, -1, -2, NaN, NaN, Inf, NaN, - neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, - neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; - std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; - test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); -} - -TEST_F(DropListDuplicatesTest, StringTestsNonNull) -{ - // Trivial cases - empty input. - { - auto const lists = StrListsCol{{}}; - auto const expected = StrListsCol{{}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // No duplicate entry. - { - auto const lists = StrListsCol{"this", "is", "a", "string"}; - auto const expected = StrListsCol{"a", "is", "string", "this"}; - auto const results = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // One list column. - { - auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; - auto const expected = StrListsCol{"a", "is", "string", "this"}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // One list column, input is a strings column with given non-default null_equality and - // nans_equality parameters. - { - auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; - auto const expected = StrListsCol{"a", "is", "string", "this"}; - auto const results = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // Multiple lists column. - { - auto const lists = - StrListsCol{StrListsCol{"this", "is", "a", "no duplicate", "string"}, - StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}, - StrListsCol{"this", "is", "is", "is", "a", "two duplicates", "string"}, - StrListsCol{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}; - auto const expected = StrListsCol{StrListsCol{"a", "is", "no duplicate", "string", "this"}, - StrListsCol{"a", "is", "one duplicate", "string", "this"}, - StrListsCol{"a", "is", "string", "this", "two duplicates"}, - StrListsCol{"a", "is", "string", "this", "three duplicates"}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TEST_F(DropListDuplicatesTest, StringTestsWithNulls) -{ - auto const null = std::string(""); - - // One list column with null entries. - { - auto const lists = StrListsCol{ - {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; - auto const expected = StrListsCol{{"a", "is", "string", "this", null}, null_at(4)}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // Multiple lists column with null lists and null entries - { - auto const lists = StrListsCol{ - {StrListsCol{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - nulls_at({1, 3, 5, 7})}, - StrListsCol{}, /* NULL */ - StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}}, - null_at(1)}; - auto const expected = - StrListsCol{{StrListsCol{{"a", "is", "no duplicate", "string", "this", null}, null_at(5)}, - StrListsCol{}, /* NULL */ - StrListsCol{"a", "is", "one duplicate", "string", "this"}}, - null_at(1)}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -template -struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { -}; - -using TypesForTest = - cudf::test::Concat; -TYPED_TEST_SUITE(DropListDuplicatesTypedTest, TypesForTest); - -TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - - // Nested types (except struct) are not supported. - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{ListsCol{ListsCol{{1, 2}, {3}}}}), - cudf::logic_error); -} - -TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - - // Empty input. - { - auto const lists = ListsCol{}; - auto const expected = ListsCol{}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - - // All input lists are empty. - { - auto const lists = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; - auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - - // Trivial cases. - { - auto const lists = ListsCol{0, 1, 2, 3, 4, 5}; - auto const expected = ListsCol{0, 1, 2, 3, 4, 5}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - - // Multiple empty lists. - { - auto const lists = ListsCol{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; - auto const expected = ListsCol{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - - // Adjacent lists containing the same entries. - { - auto const keys = - ListsCol{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}; - auto const vals = - ListsCol{{1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 3, 4, 5, 6, 7, 8}}; - auto const expected_keys = ListsCol{{1}, {1, 2}, {2, 3}}; - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = ListsCol{{1}, {1, 6}, {1, 5}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = ListsCol{{8}, {5, 8}, {4, 8}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - } - - // Sliced list column. - auto const lists_original = - ListsCol{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - auto const lists1 = cudf::slice(lists_original, {0, 5})[0]; - auto const lists2 = cudf::slice(lists_original, {1, 5})[0]; - auto const lists3 = cudf::slice(lists_original, {1, 3})[0]; - auto const lists4 = cudf::slice(lists_original, {0, 3})[0]; - - { - auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists_original}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3, 4}, {5}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists4}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - auto constexpr null = TypeParam{0}; - - // null entries and lists. - { - auto const keys = ListsCol{{{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, - nulls_at({2, 3})}; - auto const vals = - ListsCol{{ListsCol{{1, 2, null, 4, 5}, null_at(2)}, {1}, {}, {} /*NULL*/, {1, 2, 3}, {1, 2}}, - null_at(3)}; - auto const expected_keys = - ListsCol{{{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = - ListsCol{{ListsCol{{null, 2, 1, 4}, null_at(0)}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, - null_at(3)}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = - ListsCol{{ListsCol{5, 2, 1, 4}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, null_at(3)}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = - ListsCol{{{2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; - auto const expected_vals = - ListsCol{{ListsCol{2, 1, 4}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, null_at(3)}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_vals->view(), expected_vals, verbosity); - } - } - - // null entries are equal. - { - auto const keys = - ListsCol{{null, 1, null, 3, null, 5, null, 7, null, 9}, nulls_at({0, 2, 4, 6, 8})}; - auto const vals = ListsCol{{null, 1, 2, 3, 4, null, 6, 7, 8, null}, nulls_at({0, 5, 9})}; - auto const expected_keys = ListsCol{{1, 3, 5, 7, 9, null}, null_at(5)}; - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = ListsCol{{1, 3, null, 7, null, null}, nulls_at({2, 4, 5})}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = ListsCol{{1, 3, null, 7, null, 8}, nulls_at({2, 4})}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = ListsCol{1, 3, 5, 7, 9}; - auto const expected_vals = ListsCol{{1, 3, null, 7, null}, nulls_at({2, 4})}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - } - - // null entries are not equal. - { - auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; - auto const expected = - ListsCol{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, - nulls_at({5, 6, 7, 8, 9})}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}, - cudf::null_equality::UNEQUAL); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsNoNull) -{ - using ColWrapper = cudf::test::fixed_width_column_wrapper; - - auto const get_structs = [] { - auto child1 = ColWrapper{ - 1, 1, 1, 1, 1, 1, 1, 1, // list1 - 1, 1, 1, 1, 2, 1, 2, 2, // list2 - 2, 2, 2, 2, 3, 2, 3, 3 // list3 - }; - auto child2 = StringsCol{ - // begin list1 - "Banana", - "Mango", - "Apple", - "Cherry", - "Kiwi", - "Banana", - "Cherry", - "Kiwi", // end list1 - // begin list2 - "Bear", - "Duck", - "Cat", - "Dog", - "Panda", - "Bear", - "Cat", - "Panda", // end list2 - // begin list3 - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "ÁBC", - "XYZ", - "ÁÁÁ", - "ÁBC", - "XYZ" // end list3 - }; - return StructsCol{{child1, child2}}; - }; - - auto const get_structs_expected = [] { - auto child1 = ColWrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3}; - auto child2 = StringsCol{ - // begin list1 - "Apple", - "Banana", - "Cherry", - "Kiwi", - "Mango", // end list1 - // begin list2 - "Bear", - "Cat", - "Dog", - "Duck", - "Cat", - "Panda", // end list2 - // begin list3 - "ÁBC", - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "XYZ", - "ÁBC" // end list3 - }; - return StructsCol{{child1, child2}}; - }; - - // Test full columns. - { - auto const lists = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); - } - - // Test sliced columns. - { - auto const lists_original = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected_original = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); - auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; - auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) -{ - using ColWrapper = cudf::test::fixed_width_column_wrapper; - auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level - auto constexpr null = int32_t{0}; // nulls at the children columns level - - auto const get_structs = [] { - auto child1 = ColWrapper{{ - 1, 1, null, XXX, XXX, 1, 1, 1, // list1 - 1, 1, 1, 1, 2, 1, null, 2, // list2 - null, null, 2, 2, 3, 2, 3, 3 // list3 - }, - nulls_at({2, 14, 16, 17})}; - auto child2 = StringsCol{{ - // begin list1 - "Banana", - "Mango", - "Apple", - "XXX", /*NULL*/ - "XXX", /*NULL*/ - "Banana", - "Cherry", - "Kiwi", // end list1 - // begin list2 - "Bear", - "Duck", - "Cat", - "Dog", - "Panda", - "Bear", - "" /*NULL*/, - "Panda", // end list2 - // begin list3 - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÁBC", - "XYZ" // end list3 - }, - nulls_at({14, 20})}; - return StructsCol{{child1, child2}, nulls_at({3, 4})}; - }; - - auto const get_structs_expected = [] { - auto child1 = - ColWrapper{{1, 1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, - nulls_at({4, 5, 11, 18, 19})}; - auto child2 = StringsCol{{ - // begin list1 - "Banana", - "Cherry", - "Kiwi", - "Mango", - "Apple", - "XXX" /*NULL*/, // end list1 - // begin list2 - "Bear", - "Cat", - "Dog", - "Duck", - "Panda", - "" /*NULL*/, // end list2 - // begin list3 - "ÁBC", - "ÁÁÁ", - "ÍÍÍÍÍ", - "XYZ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÉÉÉÉÉ" // end list3 - }, - nulls_at({5, 11, 17})}; - return StructsCol{{child1, child2}, null_at(5)}; - }; - - // Test full columns. - { - auto const lists = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected = cudf::make_lists_column( - 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); - } - - // Test sliced columns. - { - auto const lists_original = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected_original = cudf::make_lists_column( - 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); - auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; - auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull) -{ - using ColWrapper = cudf::test::fixed_width_column_wrapper; - auto constexpr null = int32_t{0}; // nulls at the children columns level - // XXX and YYY are int placeholders for nulls at parent structs column level. - // We bring up two placeholders of different values to create intra null structs with - // children of different values, so as to test whether null_equality::EQUAL works or not. - auto constexpr XXX = int32_t{5}; - auto constexpr YYY = int32_t{6}; - - auto const get_nested_structs = [] { - auto grandchild1 = ColWrapper{{ - 1, XXX, null, XXX, YYY, 1, 1, 1, // list1 - 1, 1, 1, 1, 2, 1, null, 2, // list2 - null, null, 2, 2, 3, 2, 3, 3 // list3 - }, - nulls_at({2, 14, 16, 17})}; - auto grandchild2 = StringsCol{{ - // begin list1 - "Banana", - "YYY", /*NULL*/ - "Apple", - "XXX", /*NULL*/ - "YYY", /*NULL*/ - "Banana", - "Cherry", - "Kiwi", // end list1 - // begin list2 - "Bear", - "Duck", - "Cat", - "Dog", - "Panda", - "Bear", - "" /*NULL*/, - "Panda", // end list2 - // begin list3 - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÁBC", - "XYZ" // end list3 - }, - nulls_at({14, 20})}; - auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({1, 3, 4})}; - return StructsCol{{child1}}; - }; - - auto const get_nested_struct_expected = [] { - auto grandchild1 = - ColWrapper{{1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, - nulls_at({3, 4, 10, 17, 18})}; - auto grandchild2 = StringsCol{{ - // begin list1 - "Banana", - "Cherry", - "Kiwi", - "Apple", - "XXX" /*NULL*/, // end list1 - // begin list2 - "Bear", - "Cat", - "Dog", - "Duck", - "Panda", - "" /*NULL*/, // end list2 - // begin list3 - "ÁBC", - "ÁÁÁ", - "ÍÍÍÍÍ", - "XYZ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÉÉÉÉÉ" // end list3 - }, - nulls_at({4, 10, 16})}; - auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({4})}; - return StructsCol{{child1}}; - }; - - // Test full columns. - { - auto const lists = cudf::make_lists_column( - 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); - auto const expected = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); - } - - // Test sliced columns. - { - auto const lists_original = cudf::make_lists_column( - 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); - auto const expected_original = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); - auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; - auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) -{ - auto const h_child = std::vector{ - 0, -1, 1, 0, 2, 0, 1, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; - - auto const get_structs = [&] { - // Two children are just identical. - auto child1 = FloatsCol(h_child.begin(), h_child.end()); - auto child2 = FloatsCol(h_child.begin(), h_child.end()); - return StructsCol{{child1, child2}}; - }; - - // The first list does not have any NaN or -NaN, while the second list has both. - // `drop_list_duplicates` is expected to operate properly on this second list. - auto const lists_original = - cudf::make_lists_column(2, IntsCol{0, 10, 18}.release(), get_structs().release(), 0, {}); - auto const lists2 = cudf::slice(lists_original->view(), {1, 2})[0]; // test on the second list - - // Contain expected vals excluding NaN. - auto const results_children_expected = std::unordered_set{0, 1, 2}; - - // Test for cudf::nan_equality::UNEQUAL. - { - auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); - auto const child = cudf::lists_column_view(results_col->view()).child(); - auto const results_arr = cudf::test::to_host(child.child(0)).first; - - std::size_t const num_NaNs = - std::count_if(h_child.begin(), h_child.end(), [](auto x) { return std::isnan(x); }); - EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); - - std::size_t NaN_count{0}; - std::unordered_set results; - for (auto const x : results_arr) { - if (std::isnan(x)) { - ++NaN_count; - } else { - results.insert(x); - } - } - EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); - } - - // Test for cudf::nan_equality::ALL_EQUAL. - { - auto const results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists2}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); - auto const child = cudf::lists_column_view(results_col->view()).child(); - auto const results_arr = cudf::test::to_host(child.child(0)).first; - - std::size_t const num_NaNs = 1; - EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); - - std::size_t NaN_count{0}; - std::unordered_set results; - for (auto const x : results_arr) { - if (std::isnan(x)) { - ++NaN_count; - } else { - results.insert(x); - } - } - EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); - } -}