diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 3635336e8..3617426f7 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -83,3 +83,18 @@ ConfigureBench(DYNAMIC_MAP_BENCH # - hash function benchmarks ---------------------------------------------------------------------- ConfigureBench(HASH_BENCH hash_bench.cu) + +################################################################################################### +# - dynamic_bitset benchmarks ------------------------------------------------------------------------- +ConfigureBench(DYNAMIC_BITSET_BENCH + trie/dynamic_bitset/find_next_bench.cu + trie/dynamic_bitset/rank_bench.cu + trie/dynamic_bitset/select_bench.cu + trie/dynamic_bitset/size_bench.cu + trie/dynamic_bitset/test_bench.cu) + +################################################################################################### +# - trie benchmarks ------------------------------------------------------------------------- +ConfigureBench(TRIE_BENCH + trie/insert_bench.cu + trie/lookup_bench.cu) diff --git a/benchmarks/trie/dynamic_bitset/find_next_bench.cu b/benchmarks/trie/dynamic_bitset/find_next_bench.cu new file mode 100644 index 000000000..c6d38745c --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/find_next_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::find_next` performance + */ +template +void dynamic_bitset_find_next(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.find_next(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_find_next, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_find_next") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/rank_bench.cu b/benchmarks/trie/dynamic_bitset/rank_bench.cu new file mode 100644 index 000000000..6b41cf029 --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/rank_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::rank` performance + */ +template +void dynamic_bitset_rank(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.rank(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_rank, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_rank") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/select_bench.cu b/benchmarks/trie/dynamic_bitset/select_bench.cu new file mode 100644 index 000000000..755018ea3 --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/select_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::select` performance + */ +template +void dynamic_bitset_select(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.select(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_select, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_select") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/size_bench.cu b/benchmarks/trie/dynamic_bitset/size_bench.cu new file mode 100644 index 000000000..d31ecebbd --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/size_bench.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::size` performance + */ +template +void dynamic_bitset_size(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + state.add_element_count(1); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const size = bitset.size(); }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_size, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_size") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/test_bench.cu b/benchmarks/trie/dynamic_bitset/test_bench.cu new file mode 100644 index 000000000..46109c146 --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/test_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::test` performance + */ +template +void dynamic_bitset_test(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.test(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_test, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_test") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu new file mode 100644 index 000000000..90c243ed0 --- /dev/null +++ b/benchmarks/trie/insert_bench.cu @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include <../tests/trie/utils.hpp> +#include +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::trie::insert` performance + */ +template +void trie_insert(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumKeys", 100 * 1000); + auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10); + + cuco::experimental::trie trie; + + thrust::host_vector labels; + thrust::host_vector offsets; + + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); + + state.add_element_count(num_keys); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + for (auto& key : keys) { + trie.insert(key.begin(), key.end()); + } + }); +} + +NVBENCH_BENCH_TYPES(trie_insert, NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("trie_insert") + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}) + .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}); diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu new file mode 100644 index 000000000..55a30b168 --- /dev/null +++ b/benchmarks/trie/lookup_bench.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include <../tests/trie/utils.hpp> +#include +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::trie::lookup` performance + */ +template +void trie_lookup(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumKeys", 100 * 1000); + auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10); + + cuco::experimental::trie trie; + + thrust::host_vector labels; + thrust::host_vector offsets; + + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); + + for (auto key : keys) { + trie.insert(key.begin(), key.end()); + } + trie.build(); + + const size_t query_size = min(1000 * 1000lu, num_keys / 10); + thrust::device_vector inputs(labels.begin(), labels.begin() + offsets[query_size]); + thrust::device_vector d_offsets(offsets.begin(), offsets.begin() + query_size); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + trie.lookup(inputs.begin(), d_offsets.begin(), d_offsets.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(trie_lookup, NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("trie_lookup") + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}) + .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d78627eee..5b0ff128e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -40,3 +40,5 @@ ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") +ConfigureExample(TRIE_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/trie/host_bulk_example.cu") +ConfigureExample(TRIE_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/trie/device_ref_example.cu") diff --git a/examples/trie/device_ref_example.cu b/examples/trie/device_ref_example.cu new file mode 100644 index 000000000..d26e0524c --- /dev/null +++ b/examples/trie/device_ref_example.cu @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <../tests/trie/utils.hpp> +#include + +#include +#include +#include +#include + +using namespace cuco::utility; + +template +__global__ void lookup_kernel( + TrieRef ref, LabelIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const loop_stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + auto key_start_pos = keys + offsets[key_id]; + auto key_length = offsets[key_id + 1] - offsets[key_id]; + + outputs[key_id] = ref.lookup(key_start_pos, key_length); + key_id += loop_stride; + } +} + +/** + * @file device_ref_example.cu + * @brief Demonstrates usage of the trie device-side APIs. + * + * trie provides a non-owning reference which can be used to interact with + * the container from within device code. + * + */ +int main(void) +{ + using LabelType = int; + + std::size_t num_keys = 64 * 1024; + std::size_t max_key_length = 6; + + thrust::host_vector labels; + thrust::host_vector offsets; + + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); + + cuco::experimental::trie trie; + for (auto key : keys) { + trie.insert(key.begin(), key.end()); + } + trie.build(); + + thrust::device_vector d_labels = labels; + thrust::device_vector d_offsets = offsets; + thrust::device_vector result(num_keys, -1lu); + + trie_lookup_kernel<<<128, 128>>>(trie.ref(cuco::experimental::trie_lookup), + d_labels.begin(), + d_offsets.begin(), + result.begin(), + num_keys); + + bool const all_keys_found = + thrust::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys)); + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/examples/trie/host_bulk_example.cu b/examples/trie/host_bulk_example.cu new file mode 100644 index 000000000..6cf6ac7ef --- /dev/null +++ b/examples/trie/host_bulk_example.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <../tests/trie/utils.hpp> +#include + +#include +#include +#include +#include + +using namespace cuco::utility; + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the trie "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `lookup` on a set of keys. + * + */ +int main(void) +{ + using LabelType = int; + + std::size_t num_keys = 64 * 1024; + std::size_t max_key_length = 6; + + thrust::host_vector labels; + thrust::host_vector offsets; + + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); + + cuco::experimental::trie trie; + for (auto key : keys) { + trie.insert(key.begin(), key.end()); + } + trie.build(); + + thrust::device_vector d_labels = labels; + thrust::device_vector d_offsets = offsets; + thrust::device_vector result(num_keys, -1lu); + + trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin()); + + bool const all_keys_found = + thrust::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys)); + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 8383669fc..06d845045 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -102,6 +102,16 @@ class dynamic_bitset { */ constexpr dynamic_bitset(Allocator const& allocator = Allocator{}); + /** + * @brief Inserts words in the range [word_begin, word_end) + * + * @param words_begin Begin iterator to words list + * @param words_end End iterator to words list + * @param n_bits Number of bits to be inserted + */ + template + constexpr void insert(WordIt words_begin, WordIt words_end, size_type n_bits); + /** * @brief Appends the given element `value` to the end of the bitset * @@ -146,6 +156,26 @@ class dynamic_bitset { OutputIt outputs_begin, cuda_stream_ref stream = {}) noexcept; + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores + * position of first set bit including or after position `keys_begin[i]`, to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to list of positions to be queried + * @param keys_end End iterator to positions list + * @param outputs_begin Begin iterator to outputs of find_next operation + * @param stream Stream to execute find_next kernel + */ + template + constexpr void find_next(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total * count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`. diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index d56ef9d7c..34d6c9a70 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -44,6 +44,21 @@ constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) { } +template +template +constexpr void dynamic_bitset::insert(WordIt words_begin, + WordIt words_end, + size_type n_bits) +{ + if (n_bits == 0) { return; } + size_t num_blocks = (n_bits - 1) / bits_per_block + 1; + assert(num_blocks == cuco::detail::distance(words_begin, words_end)); + + words_.reserve(num_blocks); + words_.insert(words_.end(), words_begin, words_end); + n_bits_ = n_bits; +} + template constexpr void dynamic_bitset::push_back(bool bit) noexcept { @@ -82,6 +97,7 @@ constexpr void dynamic_bitset::test(KeyIt keys_begin, { build(); + if (n_bits_ == 0) { return; } auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } @@ -91,6 +107,25 @@ constexpr void dynamic_bitset::test(KeyIt keys_begin, ref(), keys_begin, outputs_begin, num_keys); } +template +template +constexpr void dynamic_bitset::find_next(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + if (n_bits_ == 0) { return; } + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_find_next_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + template template constexpr void dynamic_bitset::rank(KeyIt keys_begin, diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index c92ab60b2..b67ba96ee 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -55,6 +55,35 @@ __global__ void bitset_test_kernel(BitsetRef ref, } } +/* + * @brief Compute position of next set bit for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_find_next_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.find_next(keys[key_id]); + key_id += stride; + } +} + /* * @brief Gather rank values for a range of keys * diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl new file mode 100644 index 000000000..ca202d236 --- /dev/null +++ b/include/cuco/detail/trie/trie.inl @@ -0,0 +1,215 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { +namespace experimental { + +template +constexpr trie::trie(Allocator const& allocator) + : allocator_{allocator}, + num_keys_{0}, + num_nodes_{1}, + last_key_{}, + num_levels_{2}, + levels_{2}, + d_levels_ptr_{nullptr}, + device_ptr_{nullptr} +{ + levels_[0].h_louds_.push_back(0); + levels_[0].h_louds_.push_back(1); + levels_[1].h_louds_.push_back(1); + levels_[0].h_outs_.push_back(0); + levels_[0].h_labels_.push_back(root_label_); +} + +template +trie::~trie() noexcept(false) +{ + if (d_levels_ptr_) { CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); } + if (device_ptr_) { CUCO_CUDA_TRY(cudaFree(device_ptr_)); } +} + +template +template +void trie::insert(LabelIt labels_begin, LabelIt labels_end) noexcept +{ + size_t key_length = std::distance(labels_begin, labels_end); + + bool same_as_last_key = key_length == last_key_.size(); + for (size_t pos = 0; same_as_last_key && pos < last_key_.size(); pos++) { + if (labels_begin[pos] != last_key_[pos]) { same_as_last_key = false; } + } + if (same_as_last_key) { return; } // Ignore duplicate keys + // assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order + + if (key_length == 0) { + levels_[0].h_outs_.set(0, 1); + ++levels_[1].offset_; + ++num_keys_; + return; + } + + if (key_length + 1 >= levels_.size()) { levels_.resize(key_length + 2); } + + // Find first position where label is different from last_key + // Trie is not updated till that position is reached, simply skip to next position + size_type pos = 0; + for (; pos < key_length; ++pos) { + auto& level = levels_[pos + 1]; + auto label = labels_begin[pos]; + + if (pos == last_key_.size() || label != level.h_labels_.back()) { + level.h_louds_.set_last(0); + level.h_louds_.push_back(1); + level.h_outs_.push_back(0); + level.h_labels_.push_back(label); + ++num_nodes_; + break; + } + } + + // Process remaining labels after divergence point from last_key + // Each such label will create a new edge and node pair + for (++pos; pos < key_length; ++pos) { + auto& level = levels_[pos + 1]; + level.h_louds_.push_back(0); + level.h_louds_.push_back(1); + level.h_outs_.push_back(0); + level.h_labels_.push_back(labels_begin[pos]); + ++num_nodes_; + } + + levels_[key_length + 1].h_louds_.push_back(1); // Mark end of current key + ++levels_[key_length + 1].offset_; + levels_[key_length].h_outs_.set_last(1); // Set terminal bit indicating valid path + + ++num_keys_; + + last_key_.resize(key_length); + for (size_t pos = 0; pos < key_length; pos++) { + last_key_[pos] = labels_begin[pos]; + } +} + +template +void trie::build() noexcept(false) +{ + // Perform build level-by-level for all levels, followed by a deep-copy from host to device + size_type offset = 0; + + thrust::device_vector test_keys(1, 0); + thrust::device_vector test_results(1); + + for (auto& level : levels_) { + level.louds_.insert( + level.h_louds_.words_.begin(), level.h_louds_.words_.end(), level.h_louds_.n_bits_); + level.h_louds_.clear(); + + // Run host-bulk test on bitvectors to initiate internal build() + level.louds_.test(test_keys.begin(), test_keys.end(), test_results.begin()); + louds_refs_.push_back(level.louds_.ref()); + + level.outs_.insert( + level.h_outs_.words_.begin(), level.h_outs_.words_.end(), level.h_outs_.n_bits_); + level.h_outs_.clear(); + + level.outs_.test(test_keys.begin(), test_keys.end(), test_results.begin()); + outs_refs_.push_back(level.outs_.ref()); + + level.labels_ = level.h_labels_; + level.h_labels_.clear(); + level.labels_ptr_ = thrust::raw_pointer_cast(level.labels_.data()); + + offset += level.offset_; + level.offset_ = offset; + } + + louds_refs_ptr_ = thrust::raw_pointer_cast(louds_refs_.data()); + outs_refs_ptr_ = thrust::raw_pointer_cast(outs_refs_.data()); + + num_levels_ = levels_.size(); + + // Move levels to device + CUCO_CUDA_TRY(cudaMalloc(&d_levels_ptr_, sizeof(level) * num_levels_)); + CUCO_CUDA_TRY( + cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(level) * num_levels_, cudaMemcpyHostToDevice)); + + // Finally create a device copy of full trie structure + CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); + CUCO_CUDA_TRY( + cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); +} + +template +template +void trie::lookup(LabelIt labels_begin, + OffsetIt offsets_begin, + OffsetIt offsets_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept +{ + auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; + if (num_keys == 0) { return; } + + auto const grid_size = min(128lu, cuco::detail::grid_size(num_keys)); + auto ref_ = this->ref(cuco::experimental::trie_lookup); + + trie_lookup_kernel<<>>( + ref_, labels_begin, offsets_begin, outputs_begin, num_keys); +} + +template +__global__ void trie_lookup_kernel( + TrieRef ref, LabelIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const loop_stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + auto key_start_pos = keys + offsets[key_id]; + auto key_length = offsets[key_id + 1] - offsets[key_id]; + + outputs[key_id] = ref.lookup(key_start_pos, key_length); + key_id += loop_stride; + } +} + +template +template +auto trie::ref(Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{device_ptr_}; +} + +template +trie::level::level() + : h_louds_{}, + h_outs_{}, + louds_{}, + outs_{}, + labels_{}, + labels_ptr_{nullptr}, + h_labels_{}, + offset_{0} +{ +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl new file mode 100644 index 000000000..d7cb9427a --- /dev/null +++ b/include/cuco/detail/trie/trie_ref.inl @@ -0,0 +1,117 @@ +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr trie_ref::trie_ref( + const trie* trie) noexcept + : trie_{trie} +{ +} + +namespace detail { + +template +class operator_impl> { + using ref_type = trie_ref; + using size_type = size_t; + + public: + /** + * @brief Lookup a single key + * + * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's + * `LabelType` + * + * @param labels Iterator to first label of key + * @param length Number of labels in key + * + * @return Index of key if it exists in trie, -1 otherwise + */ + template + [[nodiscard]] __device__ size_type lookup(LabelIt labels, size_type length) const noexcept + { + auto const& trie = static_cast(*this).trie_; + + // Level-by-level search. node_id is updated at each level + size_type node_id = 0; + for (size_type cur_depth = 1; cur_depth <= length; cur_depth++) { + if (!search_label_in_children(labels[cur_depth - 1], node_id, cur_depth)) { return -1lu; } + } + + // Check for terminal node bit that indicates a valid key + size_type leaf_level_id = length; + if (!trie->outs_refs_ptr_[leaf_level_id].test(node_id)) { return -1lu; } + + // Key exists in trie, generate the index + auto offset = trie->d_levels_ptr_[leaf_level_id].offset_; + auto rank = trie->outs_refs_ptr_[leaf_level_id].rank(node_id); + + return offset + rank; + } + + private: + /** + * @brief Find position of last child of a node + * + * @tparam BitsetRef Device-accessible reference to bitset + * + * @param louds louds bitset of current level + * @param node_id node index in current level + * + * @return Position of last child + */ + template + [[nodiscard]] __device__ size_type last_child_position(BitsetRef louds, + size_type& node_id) const noexcept + { + size_type node_pos = 0; + if (node_id != 0) { + node_pos = louds.select(node_id - 1) + 1; + node_id = node_pos - node_id; + } + + auto pos_end = louds.find_next(node_pos); + return node_id + (pos_end - node_pos); + } + + /** + * @brief Search for a target label in children nodes of a parent node + * + * @param target Label to search for + * @param node_id Index of parent node + * @param level_id Index of current level + * + * @return Boolean indicating success of search process + */ + [[nodiscard]] __device__ bool search_label_in_children(LabelType target, + size_type& node_id, + size_type level_id) const noexcept + { + auto const& trie = static_cast(*this).trie_; + auto louds = trie->louds_refs_ptr_[level_id]; + + auto end = last_child_position(louds, node_id); // Position of last child + auto begin = node_id; // Position of first child, initialized after find_last_child call + + auto& level = trie->d_levels_ptr_[level_id]; + auto labels = level.labels_ptr_; + + // Binary search labels array of current level + while (begin < end) { + node_id = (begin + end) / 2; + auto label = labels[node_id]; + if (target < label) { + end = node_id; + } else if (target > label) { + begin = node_id + 1; + } else { + break; + } + } + return begin < end; + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index 77cf2c133..ca2953f40 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -51,6 +51,12 @@ struct contains_tag { struct find_tag { } inline constexpr find; +/** + * @brief `trie_lookup` operator tag + */ +struct trie_lookup_tag { +} inline constexpr trie_lookup; + } // namespace op } // namespace experimental } // namespace cuco diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh new file mode 100644 index 000000000..6b1837fde --- /dev/null +++ b/include/cuco/trie.cuh @@ -0,0 +1,208 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Trie class + * + * @tparam label_type type of individual elements of vector keys (eg. char or int) + * @tparam Allocator Type of allocator used for device storage + */ +template > +class trie { + public: + /** + * @brief Constructs an empty trie + * + * @param allocator Allocator used for allocating device storage + */ + constexpr trie(Allocator const& allocator = Allocator{}); + ~trie() noexcept(false); + + /** + * @brief Insert a single key into trie + * + * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's + * `LabelType` + * @param labels_begin Begin iterator to list of labels of input key + * @param labels_end End iterator to list of labels of input key + */ + template + void insert(LabelIt labels_begin, LabelIt labels_end) noexcept; + + /** + * @brief Build level-by-level trie indexes after inserting all keys + * + * In addition, a snapshot of current trie state is copied to device + */ + void build() noexcept(false); + + /** + * @brief For every pair (`offsets_begin[i]`, `offsets_begin[i + 1]`) in the range + * `[offsets_begin, offsets_end)`, checks if the key defined by labels in the range + * [`labels_begin[offsets_begin[i]]`, `labels_begin[offsets_begin[i + 1]]`) is present in trie. + * Stores the index of key if it exists in trie (-1 otherwise) in `outputs_begin[i]` + * + * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's + * `LabelType` + * @tparam OffsetIt Device-accessible iterator whose `value_type` can be converted to trie's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type + * + * @param labels_begin Begin iterator to labels list of all keys + * @param offsets_begin Begin iterator to offsets of key boundaries + * @param offsets_end End iterator to offsets + * @param outputs_begin Begin iterator to lookup results + * @param stream Stream to execute lookup kernel + */ + template + void lookup(LabelIt labels_begin, + OffsetIt offsets_begin, + OffsetIt offsets_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; + + using size_type = std::size_t; ///< size type + + /** + * @brief Get current size i.e. number of keys inserted + * + * @return Number of keys + */ + size_type constexpr size() const noexcept { return num_keys_; } + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::bv_read` + * + * @return Device ref of the current `trie` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + Allocator allocator_; ///< Allocator + size_type num_keys_; ///< Number of keys inserted into trie + size_type num_nodes_; ///< Number of internal nodes + std::vector last_key_; ///< Last key inserted into trie + + static constexpr LabelType root_label_ = sizeof(LabelType) == 1 ? ' ' : -1; ///< Sentinel value + + struct level; + size_type num_levels_; ///< Number of trie levels + std::vector levels_; ///< Host-side array of levels + level* d_levels_ptr_; ///< Device-side array of levels + + using bitset_type = typename detail::dynamic_bitset; ///< Bitset type + using bitset_ref = typename bitset_type::ref_type; ///< Bitset ref + /// Type of the allocator to (de)allocate bitset refs + using bitset_allocator_type = typename std::allocator_traits::rebind_alloc; + ///< refs to per-level louds bitsets + thrust::device_vector louds_refs_; + ///< refs to per-level outs bitsets + thrust::device_vector outs_refs_; + + bitset_ref* louds_refs_ptr_; ///< Raw device pointer to louds_refs_ + bitset_ref* outs_refs_ptr_; ///< Raw device pointer to outs_refs_ + + trie* device_ptr_; ///< Device-side copy of trie + + ///< Non-owning container ref type + template + using ref_type = cuco::experimental::trie_ref; + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + + // Host bitset to buffer bit updates before bulk initializing dynamic_bitset on device + // TODO: This struct replicates code from dynamic_bitset. Remove these parts from dynamic_bitset? + struct host_bitset { + using word_type = typename bitset_type::word_type; + thrust::host_vector words_; + size_type n_bits_; + + host_bitset() noexcept : n_bits_{0} {} + void push_back(bool bit) noexcept + { + if (n_bits_ % bits_per_block == 0) { + words_.resize(words_.size() + words_per_block); // Extend storage by one block + } + + set(n_bits_++, bit); + } + void set(size_type index, bool bit) noexcept + { + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; + if (bit) { + words_[word_id] |= 1UL << bit_id; + } else { + words_[word_id] &= ~(1UL << bit_id); + } + } + void set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } + void clear() noexcept + { + words_.clear(); + n_bits_ = 0; + } + + private: + const size_type words_per_block = bitset_type::words_per_block; + const size_type bits_per_block = bitset_type::bits_per_block; + const size_type bits_per_word = bitset_type::bits_per_word; + }; + + /** + * @brief Struct to represent each trie level + */ + struct level { + level(); + level(level&&) = default; ///< Move constructor + + bitset_type louds_; ///< Indicates links to next and previous level + bitset_type outs_; ///< Indicates terminal nodes of valid keys + + host_bitset h_louds_; ///< Host buffer for louds + host_bitset h_outs_; ///< Host buffer for outs + + /// Type of the allocator to (de)allocate labels + using label_allocator_type = typename std::allocator_traits::rebind_alloc; + thrust::device_vector labels_; ///< Labels at this level + LabelType* labels_ptr_; ///< Raw device pointer to labels + + std::vector h_labels_; ///< Host copy of labels, using std::vector for performance + + size_type offset_; ///< Cumulative node count in parent levels + }; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/trie_ref.cuh b/include/cuco/trie_ref.cuh new file mode 100644 index 000000000..244d3adfb --- /dev/null +++ b/include/cuco/trie_ref.cuh @@ -0,0 +1,40 @@ +#pragma once + +#include + +namespace cuco { +namespace experimental { + +template +class trie; + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @tparam LabelType Trie label type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class trie_ref + : public detail::operator_impl>... { + public: + /** + * @brief Constructs trie_ref. + * + * @param trie Non-owning ref of trie + */ + __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; + + private: + const trie* trie_; + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3deeeddf1..62569500f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,12 +34,13 @@ endif() ################################################################################################### function(ConfigureTest TEST_NAME) add_executable(${TEST_NAME} ${ARGN}) - target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart) + target_link_options(${TEST_NAME} PRIVATE -fopenmp) + target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart Threads::Threads) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra - --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) + --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage -fopenmp) catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests) endfunction(ConfigureTest) @@ -100,8 +101,13 @@ ConfigureTest(STATIC_MULTIMAP_TEST ################################################################################################### # - dynamic_bitset tests -------------------------------------------------------------------------- ConfigureTest(DYNAMIC_BITSET_TEST - dynamic_bitset/find_next_test.cu - dynamic_bitset/get_test.cu - dynamic_bitset/rank_test.cu - dynamic_bitset/select_test.cu - dynamic_bitset/size_test.cu) + trie/dynamic_bitset/find_next_test.cu + trie/dynamic_bitset/get_test.cu + trie/dynamic_bitset/rank_test.cu + trie/dynamic_bitset/select_test.cu + trie/dynamic_bitset/size_test.cu) + +################################################################################################### +# - trie tests ------------------------------------------------------------------------------ +ConfigureTest(TRIE_TEST + trie/lookup_test.cu) diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/trie/dynamic_bitset/find_next_test.cu similarity index 100% rename from tests/dynamic_bitset/find_next_test.cu rename to tests/trie/dynamic_bitset/find_next_test.cu diff --git a/tests/dynamic_bitset/get_test.cu b/tests/trie/dynamic_bitset/get_test.cu similarity index 100% rename from tests/dynamic_bitset/get_test.cu rename to tests/trie/dynamic_bitset/get_test.cu diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/trie/dynamic_bitset/rank_test.cu similarity index 100% rename from tests/dynamic_bitset/rank_test.cu rename to tests/trie/dynamic_bitset/rank_test.cu diff --git a/tests/dynamic_bitset/select_test.cu b/tests/trie/dynamic_bitset/select_test.cu similarity index 100% rename from tests/dynamic_bitset/select_test.cu rename to tests/trie/dynamic_bitset/select_test.cu diff --git a/tests/dynamic_bitset/size_test.cu b/tests/trie/dynamic_bitset/size_test.cu similarity index 100% rename from tests/dynamic_bitset/size_test.cu rename to tests/trie/dynamic_bitset/size_test.cu diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu new file mode 100644 index 000000000..cdc38cd35 --- /dev/null +++ b/tests/trie/lookup_test.cu @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include +#include + +#include + +using namespace cuco::utility; + +template +void trie_lookup_test() +{ + std::size_t num_keys = 64 * 1024; + std::size_t max_key_length = 6; + + thrust::host_vector labels; + thrust::host_vector offsets; + + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); + + cuco::experimental::trie trie; + for (auto key : keys) { + trie.insert(key.begin(), key.end()); + } + trie.build(); + + thrust::device_vector d_labels = labels; + thrust::device_vector d_offsets = offsets; + thrust::device_vector result(num_keys, -1lu); + + trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin()); + REQUIRE(cuco::test::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys))); +} + +TEST_CASE("Trie lookup", "") +{ + trie_lookup_test(); + trie_lookup_test(); +} diff --git a/tests/trie/utils.hpp b/tests/trie/utils.hpp new file mode 100644 index 000000000..7eae100af --- /dev/null +++ b/tests/trie/utils.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace cuco { +namespace test { +namespace trie { + +struct valid_key { + valid_key(size_t num_keys) : num_keys_(num_keys) {} + __host__ __device__ bool operator()(size_t x) const { return x < num_keys_; } + const size_t num_keys_; +}; + +template +void generate_labels(thrust::host_vector& labels, + thrust::host_vector& offsets, + size_t num_keys, + size_t max_key_length, + LengthsDist lengths_dist, + LabelsDist labels_dist) +{ + cuco::utility::key_generator gen; + + offsets.resize(num_keys); + gen.generate(lengths_dist, offsets.begin(), offsets.end()); + + for (auto& offset : offsets) { + offset = 1 + (offset % max_key_length); + } + + offsets.push_back(0); + thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); + + labels.resize(offsets.back()); + gen.generate(labels_dist, labels.begin(), labels.end()); +} + +template +std::vector> sorted_keys(thrust::host_vector& labels, + thrust::host_vector& offsets) +{ + std::vector> keys; + size_t num_keys = offsets.size() - 1; + for (size_t key_id = 0; key_id < num_keys; key_id++) { + std::vector cur_key; + for (size_t pos = offsets[key_id]; pos < offsets[key_id + 1]; pos++) { + cur_key.push_back(labels[pos]); + } + keys.push_back(cur_key); + } + sort(keys.begin(), keys.end()); + return keys; +} + +} // namespace trie +} // namespace test +} // namespace cuco