diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index 316fe4614b3..53a6a3613b9 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -50,6 +50,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) {0}, cudf::duplicate_keep_option::KEEP_ANY, cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, stream_view); }); } @@ -90,8 +91,12 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::distinct( - *table, {0}, cudf::duplicate_keep_option::KEEP_ANY, cudf::null_equality::EQUAL, stream_view); + auto result = cudf::detail::distinct(*table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + stream_view); }); } diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index f8a745dad86..8ba7b0cb996 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -85,6 +85,7 @@ std::unique_ptr distinct( std::vector const& keys, duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -95,17 +96,19 @@ std::unique_ptr
distinct( * generated. If there are duplicate rows, which index is kept depends on the `keep` parameter. * * @param input The input table - * @param keep Get index of the first, last, any, or none row among the found duplicates rows + * @param keep Get index of any, first, last, or none of the found duplicates * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the result indices */ rmm::device_uvector get_distinct_indices( table_view const& input, - duplicate_keep_option keep, - null_equality nulls_equal, - rmm::cuda_stream_view stream, + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index b45b3958c35..f071dcc8f70 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -263,8 +263,8 @@ std::unique_ptr
unique( * @param[in] input input table_view to copy only distinct rows * @param[in] keys vector of indices representing key columns from `input` * @param[in] keep keep any, first, last, or none of the found duplicates - * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not - * equal if null_equality::UNEQUAL + * @param[in] nulls_equal flag to control if nulls are compared equal or not + * @param[in] nans_equal flag to control if floating-point NaN values are compared equal or not * @param[in] mr Device memory resource used to allocate the returned table's device * memory * @@ -275,6 +275,7 @@ std::unique_ptr
distinct( std::vector const& keys, duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 927f913f0c8..3dea491b6e4 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -64,6 +64,7 @@ std::unique_ptr add_keys( std::vector{0}, // only one key column duplicate_keep_option::KEEP_ANY, null_equality::EQUAL, + nan_equality::ALL_EQUAL, stream, mr); std::vector column_order{order::ASCENDING}; diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 47ba04cf3da..d4f3a9ca495 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -227,6 +227,7 @@ std::unique_ptr concatenate(host_span columns, std::vector{0}, duplicate_keep_option::KEEP_ANY, null_equality::EQUAL, + nan_equality::ALL_EQUAL, stream, mr); auto sorted_keys = cudf::detail::sort(table_keys->view(), diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index f9656a74e12..216f00c90e1 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -132,6 +132,7 @@ std::unique_ptr set_keys( std::vector{0}, duplicate_keep_option::KEEP_ANY, null_equality::EQUAL, + nan_equality::ALL_EQUAL, stream, mr); auto sorted_keys = cudf::detail::sort(distinct_keys->view(), diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 641dcf084c5..a03e4c4441a 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -39,6 +39,7 @@ namespace detail { rmm::device_uvector get_distinct_indices(table_view const& input, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -59,12 +60,24 @@ rmm::device_uvector get_distinct_indices(table_view const& input, auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal); + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const pair_iter = cudf::detail::make_counting_transform_iterator( size_type{0}, [] __device__(size_type const i) { return cuco::make_pair(i, i); }); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); + + auto const insert_keys = [&](auto const value_comp) { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); + }; + + if (nans_equal == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + insert_keys(nan_equal_comparator{}); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; + insert_keys(nan_unequal_comparator{}); + } auto output_indices = rmm::device_uvector(map.get_size(), stream, mr); @@ -74,9 +87,15 @@ rmm::device_uvector get_distinct_indices(table_view const& input, return output_indices; } - // For other keep options, perform a (sparse) reduce-by-row on the rows compared equal. - auto const reduction_results = hash_reduce_by_row( - map, std::move(preprocessed_input), input.num_rows(), has_nulls, keep, nulls_equal, stream); + // For other keep options, reduce by row on rows that compare equal. + auto const reduction_results = hash_reduce_by_row(map, + std::move(preprocessed_input), + input.num_rows(), + has_nulls, + keep, + nulls_equal, + nans_equal, + stream); // Extract the desired output indices from reduction results. auto const map_end = [&] { @@ -111,6 +130,7 @@ std::unique_ptr
distinct(table_view const& input, std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -118,7 +138,8 @@ std::unique_ptr
distinct(table_view const& input, return empty_like(input); } - auto const gather_map = get_distinct_indices(input.select(keys), keep, nulls_equal, stream); + auto const gather_map = + get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream); return detail::gather(input, gather_map, out_of_bounds_policy::DONT_CHECK, @@ -133,10 +154,12 @@ std::unique_ptr
distinct(table_view const& input, std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::distinct(input, keys, keep, nulls_equal, cudf::default_stream_value, mr); + return detail::distinct( + input, keys, keep, nulls_equal, nans_equal, cudf::default_stream_value, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu index 50bfa3c4114..468561273b3 100644 --- a/cpp/src/stream_compaction/distinct_reduce.cu +++ b/cpp/src/stream_compaction/distinct_reduce.cu @@ -95,6 +95,7 @@ rmm::device_uvector hash_reduce_by_row( cudf::nullate::DYNAMIC has_nulls, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -111,15 +112,26 @@ rmm::device_uvector hash_reduce_by_row( auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - reduce_by_row_fn{ - map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); + + auto const reduce_by_row = [&](auto const value_comp) { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + reduce_by_row_fn{ + map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); + }; + + if (nans_equal == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + reduce_by_row(nan_equal_comparator{}); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; + reduce_by_row(nan_unequal_comparator{}); + } return reduction_results; } diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_reduce.cuh index 7b0a4148110..c8a0c2869c8 100644 --- a/cpp/src/stream_compaction/distinct_reduce.cuh +++ b/cpp/src/stream_compaction/distinct_reduce.cuh @@ -78,6 +78,7 @@ rmm::device_uvector hash_reduce_by_row( cudf::nullate::DYNAMIC has_nulls, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 5a8e707c6bd..2644d8a8e48 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -47,8 +47,13 @@ std::pair, std::unique_ptr> encode( std::vector drop_keys(num_cols); std::iota(drop_keys.begin(), drop_keys.end(), 0); - auto distinct_keys = cudf::detail::distinct( - input_table, drop_keys, duplicate_keep_option::KEEP_ANY, null_equality::EQUAL, stream, mr); + auto distinct_keys = cudf::detail::distinct(input_table, + drop_keys, + duplicate_keep_option::KEEP_ANY, + null_equality::EQUAL, + nan_equality::ALL_EQUAL, + stream, + mr); std::vector column_order(num_cols, order::ASCENDING); std::vector null_precedence(num_cols, null_order::AFTER); diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 5635822a0bc..3622f6400cb 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -32,12 +31,15 @@ auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level +auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; +auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; +auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; using int32s_col = cudf::test::fixed_width_column_wrapper; using floats_col = cudf::test::fixed_width_column_wrapper; @@ -176,6 +178,46 @@ TEST_F(DistinctKeepAny, NoNullsTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); } +TEST_F(DistinctKeepAny, NoNullsTableWithNaNs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows for same keys. + auto const col1 = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5}; + auto const col2 = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4}; + auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9}; + auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // NaNs are unequal. + { + auto const exp_col1_sort = int32s_col{5, 1, 1, 1, 5, 6, 3, 8}; + auto const exp_col2_sort = floats_col{4, 1, 1, 1, 4, 6, 3, 9}; + auto const exp_keys1_sort = int32s_col{9, 15, 15, 15, 19, 20, 20, 21}; + auto const exp_keys2_sort = floats_col{21., NaN, NaN, NaN, 20., 19., 20., 9.}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select(key_idx)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // NaNs are equal. + { + auto const exp_col1_sort = int32s_col{5, 1, 5, 6, 3, 8}; + auto const exp_col2_sort = floats_col{4, 1, 4, 6, 3, 9}; + auto const exp_keys1_sort = int32s_col{9, 15, 19, 20, 20, 21}; + auto const exp_keys2_sort = floats_col{21., NaN, 20., 19., 20., 9.}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select(key_idx)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } +} + TEST_F(DistinctKeepFirstLastNone, NoNullsTable) { // Column(s) used to test needs to have different rows for the same keys. @@ -345,7 +387,64 @@ TEST_F(DistinctKeepAny, InputWithNulls) auto const exp_keys_sort = int32s_col{{null, null, 19, 20, 21}, nulls_at({0, 1})}; auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select(key_idx)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } +} + +TEST_F(DistinctKeepAny, InputWithNullsAndNaNs) +{ + auto constexpr null{0.0}; // shadow the global `null` variable of type int + + // Column(s) used to test KEEP_ANY needs to have same rows for same keys because KEEP_ANY is + // nondeterministic. + auto const col = int32s_col{5, 4, 1, 1, 1, 4, 1, 8, 1}; + auto const keys = floats_col{{20., null, NaN, NaN, NaN, null, 19., 21., 19.}, nulls_at({1, 5})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal, NaNs are unequal. + { + auto const exp_col_sort = int32s_col{4, 1, 5, 8, 1, 1, 1}; + auto const exp_keys_sort = floats_col{{null, 19., 20., 21., NaN, NaN, NaN}, null_at(0)}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select(key_idx)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // Nulls are equal, NaNs are equal. + { + auto const exp_col_sort = int32s_col{4, 1, 5, 8, 1}; + auto const exp_keys_sort = floats_col{{null, 19., 20., 21., NaN}, null_at(0)}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select(key_idx)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // Nulls are unequal, NaNs are unequal. + { + auto const exp_col_sort = int32s_col{4, 4, 1, 5, 8, 1, 1, 1}; + auto const exp_keys_sort = + floats_col{{null, null, 19., 20., 21., NaN, NaN, NaN}, nulls_at({0, 1})}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_UNEQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select(key_idx)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // Nulls are unequal, NaNs are equal. + { + auto const exp_col_sort = int32s_col{4, 4, 1, 5, 8, 1}; + auto const exp_keys_sort = floats_col{{null, null, 19., 20., 21., NaN}, nulls_at({0, 1})}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_EQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select(key_idx)); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); } @@ -435,6 +534,90 @@ TEST_F(DistinctKeepFirstLastNone, InputWithNullsUnequal) } } +TEST_F(DistinctKeepFirstLastNone, InputWithNaNsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col_sort = int32s_col{0, 1, 3, 4, 6}; + auto const exp_keys_sort = floats_col{20., NaN, 19., 21., 22.}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_FIRST, NULL_EQUAL, NAN_EQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select({0})); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_LAST + { + auto const exp_col_sort = int32s_col{0, 2, 4, 5, 6}; + auto const exp_keys_sort = floats_col{20., NaN, 21., 19., 22.}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_LAST, NULL_EQUAL, NAN_EQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select({0})); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_NONE + { + auto const exp_col_sort = int32s_col{0, 4, 6}; + auto const exp_keys_sort = floats_col{20., 21., 22.}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_NONE, NULL_EQUAL, NAN_EQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select({0})); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } +} + +TEST_F(DistinctKeepFirstLastNone, InputWithNaNsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col_sort = int32s_col{0, 1, 2, 3, 4, 6}; + auto const exp_keys_sort = floats_col{20., NaN, NaN, 19., 21., 22.}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select({0})); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_LAST + { + auto const exp_col_sort = int32s_col{1, 2, 4, 5, 6, 7}; + auto const exp_keys_sort = floats_col{NaN, NaN, 21., 19., 22., 20.}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select({0})); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_NONE + { + auto const exp_col_sort = int32s_col{1, 2, 4, 6}; + auto const exp_keys_sort = floats_col{NaN, NaN, 21., 22.}; + auto const expected_sort = cudf::table_view{{exp_col_sort, exp_keys_sort}}; + + auto const result = cudf::distinct(input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL); + auto const result_sort = cudf::sort_by_key(*result, result->select({0})); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } +} + TEST_F(DistinctKeepAny, BasicLists) { // Column(s) used to test KEEP_ANY needs to have same rows for same keys because KEEP_ANY is @@ -550,7 +733,7 @@ TEST_F(DistinctKeepAny, NullableLists) lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/, {} /*NULL*/}, nulls_at({4, 5})}; auto const expected_sort = cudf::table_view{{exp_idx_sort, exp_keys_sort}}; - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select({0})); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); } @@ -728,7 +911,7 @@ TEST_F(DistinctKeepAny, ListsOfStructs) auto const expect_map = int32s_col{0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16}; auto const expect_table = cudf::gather(input, expect_map); - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select({0})); CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result_sort); } @@ -907,7 +1090,7 @@ TEST_F(DistinctKeepAny, SlicedListsOfStructs) auto const expect_map = int32s_col{8, 9, 10, 11, 13, 14}; auto const expect_table = cudf::gather(input_original, expect_map); - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select({0})); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expect_table, *result_sort); } @@ -973,7 +1156,7 @@ TEST_F(DistinctKeepAny, ListsOfEmptyStructs) auto const expect_map = int32s_col{0, 2, 3, 4, 5, 6, 7, 8, 9, 11}; auto const expect_table = cudf::gather(input, expect_map); - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select({0})); CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result_sort); } @@ -1010,7 +1193,7 @@ TEST_F(DistinctKeepAny, EmptyDeepList) auto const expect_map = int32s_col{0, 2, 3}; auto const expect_table = cudf::gather(input, expect_map); - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select({0})); CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result_sort); } @@ -1063,7 +1246,7 @@ TEST_F(DistinctKeepAny, StructsOfStructs) auto const expect_map = int32s_col{0, 1, 2, 3, 7, 4, 5, 8}; auto const expect_table = cudf::gather(input, expect_map); - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select({0})); CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result_sort); } @@ -1117,7 +1300,7 @@ TEST_F(DistinctKeepAny, SlicedStructsOfStructs) auto const expect_map = int32s_col{6, 1, 2, 3, 4, 5}; auto const expect_table = cudf::gather(input_original, expect_map); - auto const result = cudf::distinct(input, key_idx, KEEP_ANY, null_equality::UNEQUAL); + auto const result = cudf::distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); auto const result_sort = cudf::sort_by_key(*result, result->select({0})); CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result_sort); }