diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 3635336e8..3617426f7 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -83,3 +83,18 @@ ConfigureBench(DYNAMIC_MAP_BENCH
 # - hash function benchmarks ----------------------------------------------------------------------
 ConfigureBench(HASH_BENCH
   hash_bench.cu)
+
+###################################################################################################
+# - dynamic_bitset benchmarks -------------------------------------------------------------------------
+ConfigureBench(DYNAMIC_BITSET_BENCH
+  trie/dynamic_bitset/find_next_bench.cu
+  trie/dynamic_bitset/rank_bench.cu
+  trie/dynamic_bitset/select_bench.cu
+  trie/dynamic_bitset/size_bench.cu
+  trie/dynamic_bitset/test_bench.cu)
+
+###################################################################################################
+# - trie benchmarks -------------------------------------------------------------------------
+ConfigureBench(TRIE_BENCH
+  trie/insert_bench.cu
+  trie/lookup_bench.cu)
diff --git a/benchmarks/trie/dynamic_bitset/find_next_bench.cu b/benchmarks/trie/dynamic_bitset/find_next_bench.cu
new file mode 100644
index 000000000..c6d38745c
--- /dev/null
+++ b/benchmarks/trie/dynamic_bitset/find_next_bench.cu
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/host_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::find_next` performance
+ */
+template <typename Dist>
+void dynamic_bitset_find_next(nvbench::state& state, nvbench::type_list<Dist>)
+{
+  auto const num_bits      = state.get_int64_or_default("NumInputs", defaults::N);
+  using word_type          = typename cuco::experimental::detail::dynamic_bitset<>::word_type;
+  auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word;
+  thrust::host_vector<word_type> keys((num_bits - 1) / bits_per_word + 1);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  cuco::experimental::detail::dynamic_bitset bitset;
+  bitset.insert(keys.begin(), keys.end(), num_bits);
+
+  const size_t query_size = min(1000 * 1000lu, num_bits / 10);
+  thrust::device_vector<size_t> inputs(query_size);
+  thrust::sequence(inputs.begin(), inputs.end(), 0);
+  thrust::device_vector<size_t> outputs(query_size);
+
+  state.add_element_count(query_size);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    bitset.find_next(inputs.begin(), inputs.end(), outputs.begin());
+  });
+}
+
+NVBENCH_BENCH_TYPES(dynamic_bitset_find_next,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<distribution::uniform>))
+  .set_name("dynamic_bitset_find_next")
+  .set_type_axes_names({"Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
diff --git a/benchmarks/trie/dynamic_bitset/rank_bench.cu b/benchmarks/trie/dynamic_bitset/rank_bench.cu
new file mode 100644
index 000000000..6b41cf029
--- /dev/null
+++ b/benchmarks/trie/dynamic_bitset/rank_bench.cu
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/host_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::rank` performance
+ */
+template <typename Dist>
+void dynamic_bitset_rank(nvbench::state& state, nvbench::type_list<Dist>)
+{
+  auto const num_bits      = state.get_int64_or_default("NumInputs", defaults::N);
+  using word_type          = typename cuco::experimental::detail::dynamic_bitset<>::word_type;
+  auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word;
+  thrust::host_vector<word_type> keys((num_bits - 1) / bits_per_word + 1);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  cuco::experimental::detail::dynamic_bitset bitset;
+  bitset.insert(keys.begin(), keys.end(), num_bits);
+
+  const size_t query_size = min(1000 * 1000lu, num_bits / 10);
+  thrust::device_vector<size_t> inputs(query_size);
+  thrust::sequence(inputs.begin(), inputs.end(), 0);
+  thrust::device_vector<size_t> outputs(query_size);
+
+  state.add_element_count(query_size);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    bitset.rank(inputs.begin(), inputs.end(), outputs.begin());
+  });
+}
+
+NVBENCH_BENCH_TYPES(dynamic_bitset_rank,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<distribution::uniform>))
+  .set_name("dynamic_bitset_rank")
+  .set_type_axes_names({"Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
diff --git a/benchmarks/trie/dynamic_bitset/select_bench.cu b/benchmarks/trie/dynamic_bitset/select_bench.cu
new file mode 100644
index 000000000..755018ea3
--- /dev/null
+++ b/benchmarks/trie/dynamic_bitset/select_bench.cu
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/host_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::select` performance
+ */
+template <typename Dist>
+void dynamic_bitset_select(nvbench::state& state, nvbench::type_list<Dist>)
+{
+  auto const num_bits      = state.get_int64_or_default("NumInputs", defaults::N);
+  using word_type          = typename cuco::experimental::detail::dynamic_bitset<>::word_type;
+  auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word;
+  thrust::host_vector<word_type> keys((num_bits - 1) / bits_per_word + 1);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  cuco::experimental::detail::dynamic_bitset bitset;
+  bitset.insert(keys.begin(), keys.end(), num_bits);
+
+  const size_t query_size = min(1000 * 1000lu, num_bits / 10);
+  thrust::device_vector<size_t> inputs(query_size);
+  thrust::sequence(inputs.begin(), inputs.end(), 0);
+  thrust::device_vector<size_t> outputs(query_size);
+
+  state.add_element_count(query_size);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    bitset.select(inputs.begin(), inputs.end(), outputs.begin());
+  });
+}
+
+NVBENCH_BENCH_TYPES(dynamic_bitset_select,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<distribution::uniform>))
+  .set_name("dynamic_bitset_select")
+  .set_type_axes_names({"Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
diff --git a/benchmarks/trie/dynamic_bitset/size_bench.cu b/benchmarks/trie/dynamic_bitset/size_bench.cu
new file mode 100644
index 000000000..d31ecebbd
--- /dev/null
+++ b/benchmarks/trie/dynamic_bitset/size_bench.cu
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/host_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::size` performance
+ */
+template <typename Dist>
+void dynamic_bitset_size(nvbench::state& state, nvbench::type_list<Dist>)
+{
+  auto const num_bits      = state.get_int64_or_default("NumInputs", defaults::N);
+  using word_type          = typename cuco::experimental::detail::dynamic_bitset<>::word_type;
+  auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word;
+  thrust::host_vector<word_type> keys((num_bits - 1) / bits_per_word + 1);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  cuco::experimental::detail::dynamic_bitset bitset;
+  bitset.insert(keys.begin(), keys.end(), num_bits);
+
+  state.add_element_count(1);
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto const size = bitset.size(); });
+}
+
+NVBENCH_BENCH_TYPES(dynamic_bitset_size,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<distribution::uniform>))
+  .set_name("dynamic_bitset_size")
+  .set_type_axes_names({"Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
diff --git a/benchmarks/trie/dynamic_bitset/test_bench.cu b/benchmarks/trie/dynamic_bitset/test_bench.cu
new file mode 100644
index 000000000..46109c146
--- /dev/null
+++ b/benchmarks/trie/dynamic_bitset/test_bench.cu
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/host_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::test` performance
+ */
+template <typename Dist>
+void dynamic_bitset_test(nvbench::state& state, nvbench::type_list<Dist>)
+{
+  auto const num_bits      = state.get_int64_or_default("NumInputs", defaults::N);
+  using word_type          = typename cuco::experimental::detail::dynamic_bitset<>::word_type;
+  auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word;
+  thrust::host_vector<word_type> keys((num_bits - 1) / bits_per_word + 1);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  cuco::experimental::detail::dynamic_bitset bitset;
+  bitset.insert(keys.begin(), keys.end(), num_bits);
+
+  const size_t query_size = min(1000 * 1000lu, num_bits / 10);
+  thrust::device_vector<size_t> inputs(query_size);
+  thrust::sequence(inputs.begin(), inputs.end(), 0);
+  thrust::device_vector<size_t> outputs(query_size);
+
+  state.add_element_count(query_size);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    bitset.test(inputs.begin(), inputs.end(), outputs.begin());
+  });
+}
+
+NVBENCH_BENCH_TYPES(dynamic_bitset_test,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<distribution::uniform>))
+  .set_name("dynamic_bitset_test")
+  .set_type_axes_names({"Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu
new file mode 100644
index 000000000..90c243ed0
--- /dev/null
+++ b/benchmarks/trie/insert_bench.cu
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <../tests/trie/utils.hpp>
+#include <cuco/trie.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::experimental::trie::insert` performance
+ */
+template <typename LabelType>
+void trie_insert(nvbench::state& state, nvbench::type_list<LabelType>)
+{
+  auto const num_keys       = state.get_int64_or_default("NumKeys", 100 * 1000);
+  auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10);
+
+  cuco::experimental::trie<LabelType> trie;
+
+  thrust::host_vector<LabelType> labels;
+  thrust::host_vector<size_t> offsets;
+
+  distribution::unique lengths_dist;
+  distribution::gaussian labels_dist{0.5};
+  cuco::test::trie::generate_labels(
+    labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist);
+  auto keys = cuco::test::trie::sorted_keys(labels, offsets);
+
+  state.add_element_count(num_keys);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    for (auto& key : keys) {
+      trie.insert(key.begin(), key.end());
+    }
+  });
+}
+
+NVBENCH_BENCH_TYPES(trie_insert, NVBENCH_TYPE_AXES(nvbench::type_list<char, int>))
+  .set_name("trie_insert")
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("MaxKeyLength", std::vector<nvbench::int64_t>{4, 8, 16})
+  .add_int64_axis("NumKeys", std::vector<nvbench::int64_t>{100 * 1000, 1000 * 1000});
diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu
new file mode 100644
index 000000000..55a30b168
--- /dev/null
+++ b/benchmarks/trie/lookup_bench.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <../tests/trie/utils.hpp>
+#include <cuco/trie.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::experimental::trie::lookup` performance
+ */
+template <typename LabelType>
+void trie_lookup(nvbench::state& state, nvbench::type_list<LabelType>)
+{
+  auto const num_keys       = state.get_int64_or_default("NumKeys", 100 * 1000);
+  auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10);
+
+  cuco::experimental::trie<LabelType> trie;
+
+  thrust::host_vector<LabelType> labels;
+  thrust::host_vector<size_t> offsets;
+
+  distribution::unique lengths_dist;
+  distribution::gaussian labels_dist{0.5};
+  cuco::test::trie::generate_labels(
+    labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist);
+  auto keys = cuco::test::trie::sorted_keys(labels, offsets);
+
+  for (auto key : keys) {
+    trie.insert(key.begin(), key.end());
+  }
+  trie.build();
+
+  const size_t query_size = min(1000 * 1000lu, num_keys / 10);
+  thrust::device_vector<LabelType> inputs(labels.begin(), labels.begin() + offsets[query_size]);
+  thrust::device_vector<size_t> d_offsets(offsets.begin(), offsets.begin() + query_size);
+  thrust::device_vector<size_t> outputs(query_size);
+
+  state.add_element_count(query_size);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    trie.lookup(inputs.begin(), d_offsets.begin(), d_offsets.end(), outputs.begin());
+  });
+}
+
+NVBENCH_BENCH_TYPES(trie_lookup, NVBENCH_TYPE_AXES(nvbench::type_list<char, int>))
+  .set_name("trie_lookup")
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("MaxKeyLength", std::vector<nvbench::int64_t>{4, 8, 16})
+  .add_int64_axis("NumKeys", std::vector<nvbench::int64_t>{100 * 1000, 1000 * 1000});
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d78627eee..5b0ff128e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -40,3 +40,5 @@ ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta
 ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
 ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
 ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
+ConfigureExample(TRIE_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/trie/host_bulk_example.cu")
+ConfigureExample(TRIE_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/trie/device_ref_example.cu")
diff --git a/examples/trie/device_ref_example.cu b/examples/trie/device_ref_example.cu
new file mode 100644
index 000000000..d26e0524c
--- /dev/null
+++ b/examples/trie/device_ref_example.cu
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <../tests/trie/utils.hpp>
+#include <cuco/trie.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/logical.h>
+#include <thrust/scan.h>
+
+using namespace cuco::utility;
+
+template <typename TrieRef, typename LabelIt, typename OffsetIt, typename OutputIt>
+__global__ void lookup_kernel(
+  TrieRef ref, LabelIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys)
+{
+  auto key_id            = cuco::detail::global_thread_id();
+  auto const loop_stride = cuco::detail::grid_stride();
+
+  while (key_id < num_keys) {
+    auto key_start_pos = keys + offsets[key_id];
+    auto key_length    = offsets[key_id + 1] - offsets[key_id];
+
+    outputs[key_id] = ref.lookup(key_start_pos, key_length);
+    key_id += loop_stride;
+  }
+}
+
+/**
+ * @file device_ref_example.cu
+ * @brief Demonstrates usage of the trie device-side APIs.
+ *
+ * trie provides a non-owning reference which can be used to interact with
+ * the container from within device code.
+ *
+ */
+int main(void)
+{
+  using LabelType = int;
+
+  std::size_t num_keys       = 64 * 1024;
+  std::size_t max_key_length = 6;
+
+  thrust::host_vector<LabelType> labels;
+  thrust::host_vector<size_t> offsets;
+
+  distribution::unique lengths_dist;
+  distribution::gaussian labels_dist{0.5};
+
+  cuco::test::trie::generate_labels(
+    labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist);
+  auto keys = cuco::test::trie::sorted_keys(labels, offsets);
+
+  cuco::experimental::trie<LabelType> trie;
+  for (auto key : keys) {
+    trie.insert(key.begin(), key.end());
+  }
+  trie.build();
+
+  thrust::device_vector<LabelType> d_labels = labels;
+  thrust::device_vector<size_t> d_offsets   = offsets;
+  thrust::device_vector<size_t> result(num_keys, -1lu);
+
+  trie_lookup_kernel<<<128, 128>>>(trie.ref(cuco::experimental::trie_lookup),
+                                   d_labels.begin(),
+                                   d_offsets.begin(),
+                                   result.begin(),
+                                   num_keys);
+
+  bool const all_keys_found =
+    thrust::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys));
+  if (all_keys_found) { std::cout << "Success! Found all keys.\n"; }
+
+  return 0;
+}
diff --git a/examples/trie/host_bulk_example.cu b/examples/trie/host_bulk_example.cu
new file mode 100644
index 000000000..6cf6ac7ef
--- /dev/null
+++ b/examples/trie/host_bulk_example.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <../tests/trie/utils.hpp>
+#include <cuco/trie.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/logical.h>
+#include <thrust/scan.h>
+
+using namespace cuco::utility;
+
+/**
+ * @file host_bulk_example.cu
+ * @brief Demonstrates usage of the trie "bulk" host APIs.
+ *
+ * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or
+ * `lookup` on a set of keys.
+ *
+ */
+int main(void)
+{
+  using LabelType = int;
+
+  std::size_t num_keys       = 64 * 1024;
+  std::size_t max_key_length = 6;
+
+  thrust::host_vector<LabelType> labels;
+  thrust::host_vector<size_t> offsets;
+
+  distribution::unique lengths_dist;
+  distribution::gaussian labels_dist{0.5};
+
+  cuco::test::trie::generate_labels(
+    labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist);
+  auto keys = cuco::test::trie::sorted_keys(labels, offsets);
+
+  cuco::experimental::trie<LabelType> trie;
+  for (auto key : keys) {
+    trie.insert(key.begin(), key.end());
+  }
+  trie.build();
+
+  thrust::device_vector<LabelType> d_labels = labels;
+  thrust::device_vector<size_t> d_offsets   = offsets;
+  thrust::device_vector<size_t> result(num_keys, -1lu);
+
+  trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin());
+
+  bool const all_keys_found =
+    thrust::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys));
+  if (all_keys_found) { std::cout << "Success! Found all keys.\n"; }
+
+  return 0;
+}
diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
index 8383669fc..06d845045 100644
--- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
+++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
@@ -102,6 +102,16 @@ class dynamic_bitset {
    */
   constexpr dynamic_bitset(Allocator const& allocator = Allocator{});
 
+  /**
+   * @brief Inserts words in the range [word_begin, word_end)
+   *
+   * @param words_begin Begin iterator to words list
+   * @param words_end End iterator to words list
+   * @param n_bits Number of bits to be inserted
+   */
+  template <typename WordIt>
+  constexpr void insert(WordIt words_begin, WordIt words_end, size_type n_bits);
+
   /**
    * @brief Appends the given element `value` to the end of the bitset
    *
@@ -146,6 +156,26 @@ class dynamic_bitset {
                       OutputIt outputs_begin,
                       cuda_stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores
+   * position of first set bit including or after position `keys_begin[i]`, to `output_begin[i]`.
+   *
+   * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+   * `size_type`
+   * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
+   * `size_type`
+   *
+   * @param keys_begin Begin iterator to list of positions to be queried
+   * @param keys_end End iterator to positions list
+   * @param outputs_begin Begin iterator to outputs of find_next operation
+   * @param stream Stream to execute find_next kernel
+   */
+  template <typename KeyIt, typename OutputIt>
+  constexpr void find_next(KeyIt keys_begin,
+                           KeyIt keys_end,
+                           OutputIt outputs_begin,
+                           cuda_stream_ref stream = {}) noexcept;
+
   /**
    * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total
    * count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`.
diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
index d56ef9d7c..34d6c9a70 100644
--- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
+++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
@@ -44,6 +44,21 @@ constexpr dynamic_bitset<Allocator>::dynamic_bitset(Allocator const& allocator)
 {
 }
 
+template <class Allocator>
+template <typename WordIt>
+constexpr void dynamic_bitset<Allocator>::insert(WordIt words_begin,
+                                                 WordIt words_end,
+                                                 size_type n_bits)
+{
+  if (n_bits == 0) { return; }
+  size_t num_blocks = (n_bits - 1) / bits_per_block + 1;
+  assert(num_blocks == cuco::detail::distance(words_begin, words_end));
+
+  words_.reserve(num_blocks);
+  words_.insert(words_.end(), words_begin, words_end);
+  n_bits_ = n_bits;
+}
+
 template <class Allocator>
 constexpr void dynamic_bitset<Allocator>::push_back(bool bit) noexcept
 {
@@ -82,6 +97,7 @@ constexpr void dynamic_bitset<Allocator>::test(KeyIt keys_begin,
 
 {
   build();
+  if (n_bits_ == 0) { return; }
   auto const num_keys = cuco::detail::distance(keys_begin, keys_end);
   if (num_keys == 0) { return; }
 
@@ -91,6 +107,25 @@ constexpr void dynamic_bitset<Allocator>::test(KeyIt keys_begin,
     ref(), keys_begin, outputs_begin, num_keys);
 }
 
+template <class Allocator>
+template <typename KeyIt, typename OutputIt>
+constexpr void dynamic_bitset<Allocator>::find_next(KeyIt keys_begin,
+                                                    KeyIt keys_end,
+                                                    OutputIt outputs_begin,
+                                                    cuda_stream_ref stream) noexcept
+
+{
+  build();
+  if (n_bits_ == 0) { return; }
+  auto const num_keys = cuco::detail::distance(keys_begin, keys_end);
+  if (num_keys == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num_keys);
+
+  bitset_find_next_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    ref(), keys_begin, outputs_begin, num_keys);
+}
+
 template <class Allocator>
 template <typename KeyIt, typename OutputIt>
 constexpr void dynamic_bitset<Allocator>::rank(KeyIt keys_begin,
diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
index c92ab60b2..b67ba96ee 100644
--- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
+++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
@@ -55,6 +55,35 @@ __global__ void bitset_test_kernel(BitsetRef ref,
   }
 }
 
+/*
+ * @brief Compute position of next set bit for a range of keys
+ *
+ * @tparam BitsetRef Bitset reference type
+ * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+ * `size_type`
+ * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
+ * `size_type`
+ *
+ * @param ref Bitset ref
+ * @param keys Begin iterator to keys
+ * @param outputs Begin iterator to outputs
+ * @param num_keys Number of input keys
+ */
+template <typename BitsetRef, typename KeyIt, typename OutputIt>
+__global__ void bitset_find_next_kernel(BitsetRef ref,
+                                        KeyIt keys,
+                                        OutputIt outputs,
+                                        cuco::detail::index_type num_keys)
+{
+  auto key_id       = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (key_id < num_keys) {
+    outputs[key_id] = ref.find_next(keys[key_id]);
+    key_id += stride;
+  }
+}
+
 /*
  * @brief Gather rank values for a range of keys
  *
diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl
new file mode 100644
index 000000000..ca202d236
--- /dev/null
+++ b/include/cuco/detail/trie/trie.inl
@@ -0,0 +1,215 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace cuco {
+namespace experimental {
+
+template <typename LabelType, class Allocator>
+constexpr trie<LabelType, Allocator>::trie(Allocator const& allocator)
+  : allocator_{allocator},
+    num_keys_{0},
+    num_nodes_{1},
+    last_key_{},
+    num_levels_{2},
+    levels_{2},
+    d_levels_ptr_{nullptr},
+    device_ptr_{nullptr}
+{
+  levels_[0].h_louds_.push_back(0);
+  levels_[0].h_louds_.push_back(1);
+  levels_[1].h_louds_.push_back(1);
+  levels_[0].h_outs_.push_back(0);
+  levels_[0].h_labels_.push_back(root_label_);
+}
+
+template <typename LabelType, class Allocator>
+trie<LabelType, Allocator>::~trie() noexcept(false)
+{
+  if (d_levels_ptr_) { CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); }
+  if (device_ptr_) { CUCO_CUDA_TRY(cudaFree(device_ptr_)); }
+}
+
+template <typename LabelType, class Allocator>
+template <typename LabelIt>
+void trie<LabelType, Allocator>::insert(LabelIt labels_begin, LabelIt labels_end) noexcept
+{
+  size_t key_length = std::distance(labels_begin, labels_end);
+
+  bool same_as_last_key = key_length == last_key_.size();
+  for (size_t pos = 0; same_as_last_key && pos < last_key_.size(); pos++) {
+    if (labels_begin[pos] != last_key_[pos]) { same_as_last_key = false; }
+  }
+  if (same_as_last_key) { return; }  // Ignore duplicate keys
+  // assert(num_keys_ == 0 || key > last_key_);  // Keys are expected to be inserted in sorted order
+
+  if (key_length == 0) {
+    levels_[0].h_outs_.set(0, 1);
+    ++levels_[1].offset_;
+    ++num_keys_;
+    return;
+  }
+
+  if (key_length + 1 >= levels_.size()) { levels_.resize(key_length + 2); }
+
+  // Find first position where label is different from last_key
+  // Trie is not updated till that position is reached, simply skip to next position
+  size_type pos = 0;
+  for (; pos < key_length; ++pos) {
+    auto& level = levels_[pos + 1];
+    auto label  = labels_begin[pos];
+
+    if (pos == last_key_.size() || label != level.h_labels_.back()) {
+      level.h_louds_.set_last(0);
+      level.h_louds_.push_back(1);
+      level.h_outs_.push_back(0);
+      level.h_labels_.push_back(label);
+      ++num_nodes_;
+      break;
+    }
+  }
+
+  // Process remaining labels after divergence point from last_key
+  // Each such label will create a new edge and node pair
+  for (++pos; pos < key_length; ++pos) {
+    auto& level = levels_[pos + 1];
+    level.h_louds_.push_back(0);
+    level.h_louds_.push_back(1);
+    level.h_outs_.push_back(0);
+    level.h_labels_.push_back(labels_begin[pos]);
+    ++num_nodes_;
+  }
+
+  levels_[key_length + 1].h_louds_.push_back(1);  // Mark end of current key
+  ++levels_[key_length + 1].offset_;
+  levels_[key_length].h_outs_.set_last(1);  // Set terminal bit indicating valid path
+
+  ++num_keys_;
+
+  last_key_.resize(key_length);
+  for (size_t pos = 0; pos < key_length; pos++) {
+    last_key_[pos] = labels_begin[pos];
+  }
+}
+
+template <typename LabelType, class Allocator>
+void trie<LabelType, Allocator>::build() noexcept(false)
+{
+  // Perform build level-by-level for all levels, followed by a deep-copy from host to device
+  size_type offset = 0;
+
+  thrust::device_vector<size_type> test_keys(1, 0);
+  thrust::device_vector<bool> test_results(1);
+
+  for (auto& level : levels_) {
+    level.louds_.insert(
+      level.h_louds_.words_.begin(), level.h_louds_.words_.end(), level.h_louds_.n_bits_);
+    level.h_louds_.clear();
+
+    // Run host-bulk test on bitvectors to initiate internal build()
+    level.louds_.test(test_keys.begin(), test_keys.end(), test_results.begin());
+    louds_refs_.push_back(level.louds_.ref());
+
+    level.outs_.insert(
+      level.h_outs_.words_.begin(), level.h_outs_.words_.end(), level.h_outs_.n_bits_);
+    level.h_outs_.clear();
+
+    level.outs_.test(test_keys.begin(), test_keys.end(), test_results.begin());
+    outs_refs_.push_back(level.outs_.ref());
+
+    level.labels_ = level.h_labels_;
+    level.h_labels_.clear();
+    level.labels_ptr_ = thrust::raw_pointer_cast(level.labels_.data());
+
+    offset += level.offset_;
+    level.offset_ = offset;
+  }
+
+  louds_refs_ptr_ = thrust::raw_pointer_cast(louds_refs_.data());
+  outs_refs_ptr_  = thrust::raw_pointer_cast(outs_refs_.data());
+
+  num_levels_ = levels_.size();
+
+  // Move levels to device
+  CUCO_CUDA_TRY(cudaMalloc(&d_levels_ptr_, sizeof(level) * num_levels_));
+  CUCO_CUDA_TRY(
+    cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(level) * num_levels_, cudaMemcpyHostToDevice));
+
+  // Finally create a device copy of full trie structure
+  CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie<LabelType, Allocator>)));
+  CUCO_CUDA_TRY(
+    cudaMemcpy(device_ptr_, this, sizeof(trie<LabelType, Allocator>), cudaMemcpyHostToDevice));
+}
+
+template <typename LabelType, class Allocator>
+template <typename LabelIt, typename OffsetIt, typename OutputIt>
+void trie<LabelType, Allocator>::lookup(LabelIt labels_begin,
+                                        OffsetIt offsets_begin,
+                                        OffsetIt offsets_end,
+                                        OutputIt outputs_begin,
+                                        cuda_stream_ref stream) const noexcept
+{
+  auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1;
+  if (num_keys == 0) { return; }
+
+  auto const grid_size = min(128lu, cuco::detail::grid_size(num_keys));
+  auto ref_            = this->ref(cuco::experimental::trie_lookup);
+
+  trie_lookup_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    ref_, labels_begin, offsets_begin, outputs_begin, num_keys);
+}
+
+template <typename TrieRef, typename LabelIt, typename OffsetIt, typename OutputIt>
+__global__ void trie_lookup_kernel(
+  TrieRef ref, LabelIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys)
+{
+  auto key_id            = cuco::detail::global_thread_id();
+  auto const loop_stride = cuco::detail::grid_stride();
+
+  while (key_id < num_keys) {
+    auto key_start_pos = keys + offsets[key_id];
+    auto key_length    = offsets[key_id + 1] - offsets[key_id];
+
+    outputs[key_id] = ref.lookup(key_start_pos, key_length);
+    key_id += loop_stride;
+  }
+}
+
+template <typename LabelType, class Allocator>
+template <typename... Operators>
+auto trie<LabelType, Allocator>::ref(Operators...) const noexcept
+{
+  static_assert(sizeof...(Operators), "No operators specified");
+  return ref_type<Operators...>{device_ptr_};
+}
+
+template <typename LabelType, class Allocator>
+trie<LabelType, Allocator>::level::level()
+  : h_louds_{},
+    h_outs_{},
+    louds_{},
+    outs_{},
+    labels_{},
+    labels_ptr_{nullptr},
+    h_labels_{},
+    offset_{0}
+{
+}
+
+}  // namespace experimental
+}  // namespace cuco
diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl
new file mode 100644
index 000000000..d7cb9427a
--- /dev/null
+++ b/include/cuco/detail/trie/trie_ref.inl
@@ -0,0 +1,117 @@
+namespace cuco {
+namespace experimental {
+
+template <typename LabelType, class Allocator, typename... Operators>
+__host__ __device__ constexpr trie_ref<LabelType, Allocator, Operators...>::trie_ref(
+  const trie<LabelType, Allocator>* trie) noexcept
+  : trie_{trie}
+{
+}
+
+namespace detail {
+
+template <typename LabelType, class Allocator, typename... Operators>
+class operator_impl<op::trie_lookup_tag, trie_ref<LabelType, Allocator, Operators...>> {
+  using ref_type  = trie_ref<LabelType, Allocator, Operators...>;
+  using size_type = size_t;
+
+ public:
+  /**
+   * @brief Lookup a single key
+   *
+   * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's
+   * `LabelType`
+   *
+   * @param labels Iterator to first label of key
+   * @param length Number of labels in key
+   *
+   * @return Index of key if it exists in trie, -1 otherwise
+   */
+  template <typename LabelIt>
+  [[nodiscard]] __device__ size_type lookup(LabelIt labels, size_type length) const noexcept
+  {
+    auto const& trie = static_cast<ref_type const&>(*this).trie_;
+
+    // Level-by-level search. node_id is updated at each level
+    size_type node_id = 0;
+    for (size_type cur_depth = 1; cur_depth <= length; cur_depth++) {
+      if (!search_label_in_children(labels[cur_depth - 1], node_id, cur_depth)) { return -1lu; }
+    }
+
+    // Check for terminal node bit that indicates a valid key
+    size_type leaf_level_id = length;
+    if (!trie->outs_refs_ptr_[leaf_level_id].test(node_id)) { return -1lu; }
+
+    // Key exists in trie, generate the index
+    auto offset = trie->d_levels_ptr_[leaf_level_id].offset_;
+    auto rank   = trie->outs_refs_ptr_[leaf_level_id].rank(node_id);
+
+    return offset + rank;
+  }
+
+ private:
+  /**
+   * @brief Find position of last child of a node
+   *
+   * @tparam BitsetRef Device-accessible reference to bitset
+   *
+   * @param louds louds bitset of current level
+   * @param node_id node index in current level
+   *
+   * @return Position of last child
+   */
+  template <typename BitsetRef>
+  [[nodiscard]] __device__ size_type last_child_position(BitsetRef louds,
+                                                         size_type& node_id) const noexcept
+  {
+    size_type node_pos = 0;
+    if (node_id != 0) {
+      node_pos = louds.select(node_id - 1) + 1;
+      node_id  = node_pos - node_id;
+    }
+
+    auto pos_end = louds.find_next(node_pos);
+    return node_id + (pos_end - node_pos);
+  }
+
+  /**
+   * @brief Search for a target label in children nodes of a parent node
+   *
+   * @param target Label to search for
+   * @param node_id Index of parent node
+   * @param level_id Index of current level
+   *
+   * @return Boolean indicating success of search process
+   */
+  [[nodiscard]] __device__ bool search_label_in_children(LabelType target,
+                                                         size_type& node_id,
+                                                         size_type level_id) const noexcept
+  {
+    auto const& trie = static_cast<ref_type const&>(*this).trie_;
+    auto louds       = trie->louds_refs_ptr_[level_id];
+
+    auto end   = last_child_position(louds, node_id);  // Position of last child
+    auto begin = node_id;  // Position of first child, initialized after find_last_child call
+
+    auto& level = trie->d_levels_ptr_[level_id];
+    auto labels = level.labels_ptr_;
+
+    // Binary search labels array of current level
+    while (begin < end) {
+      node_id    = (begin + end) / 2;
+      auto label = labels[node_id];
+      if (target < label) {
+        end = node_id;
+      } else if (target > label) {
+        begin = node_id + 1;
+      } else {
+        break;
+      }
+    }
+    return begin < end;
+  }
+};
+
+}  // namespace detail
+}  // namespace experimental
+}  // namespace cuco
diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp
index 77cf2c133..ca2953f40 100644
--- a/include/cuco/operator.hpp
+++ b/include/cuco/operator.hpp
@@ -51,6 +51,12 @@ struct contains_tag {
 struct find_tag {
 } inline constexpr find;
 
+/**
+ * @brief `trie_lookup` operator tag
+ */
+struct trie_lookup_tag {
+} inline constexpr trie_lookup;
+
 }  // namespace op
 }  // namespace experimental
 }  // namespace cuco
diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh
new file mode 100644
index 000000000..6b1837fde
--- /dev/null
+++ b/include/cuco/trie.cuh
@@ -0,0 +1,208 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+#include <cuco/trie_ref.cuh>
+#include <thrust/host_vector.h>
+
+namespace cuco {
+namespace experimental {
+
+/**
+ * @brief Trie class
+ *
+ * @tparam label_type type of individual elements of vector keys (eg. char or int)
+ * @tparam Allocator Type of allocator used for device storage
+ */
+template <typename LabelType, class Allocator = thrust::device_malloc_allocator<std::byte>>
+class trie {
+ public:
+  /**
+   * @brief Constructs an empty trie
+   *
+   * @param allocator Allocator used for allocating device storage
+   */
+  constexpr trie(Allocator const& allocator = Allocator{});
+  ~trie() noexcept(false);
+
+  /**
+   * @brief Insert a single key into trie
+   *
+   * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's
+   * `LabelType`
+   * @param labels_begin Begin iterator to list of labels of input key
+   * @param labels_end End iterator to list of labels of input key
+   */
+  template <typename LabelIt>
+  void insert(LabelIt labels_begin, LabelIt labels_end) noexcept;
+
+  /**
+   * @brief Build level-by-level trie indexes after inserting all keys
+   *
+   * In addition, a snapshot of current trie state is copied to device
+   */
+  void build() noexcept(false);
+
+  /**
+   * @brief For every pair (`offsets_begin[i]`, `offsets_begin[i + 1]`) in the range
+   * `[offsets_begin, offsets_end)`, checks if the key defined by labels in the range
+   * [`labels_begin[offsets_begin[i]]`, `labels_begin[offsets_begin[i + 1]]`) is present in trie.
+   * Stores the index of key if it exists in trie (-1 otherwise) in `outputs_begin[i]`
+   *
+   * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's
+   * `LabelType`
+   * @tparam OffsetIt Device-accessible iterator whose `value_type` can be converted to trie's
+   * `size_type`
+   * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean
+   * type
+   *
+   * @param labels_begin Begin iterator to labels list of all keys
+   * @param offsets_begin Begin iterator to offsets of key boundaries
+   * @param offsets_end End iterator to offsets
+   * @param outputs_begin Begin iterator to lookup results
+   * @param stream Stream to execute lookup kernel
+   */
+  template <typename LabelIt, typename OffsetIt, typename OutputIt>
+  void lookup(LabelIt labels_begin,
+              OffsetIt offsets_begin,
+              OffsetIt offsets_end,
+              OutputIt outputs_begin,
+              cuda_stream_ref stream = {}) const noexcept;
+
+  using size_type = std::size_t;  ///< size type
+
+  /**
+   * @brief Get current size i.e. number of keys inserted
+   *
+   * @return Number of keys
+   */
+  size_type constexpr size() const noexcept { return num_keys_; }
+
+  /**
+   * @brief Get device ref with operators.
+   *
+   * @tparam Operators Set of `cuco::op` to be provided by the ref
+   *
+   * @param ops List of operators, e.g., `cuco::bv_read`
+   *
+   * @return Device ref of the current `trie` object
+   */
+  template <typename... Operators>
+  [[nodiscard]] auto ref(Operators... ops) const noexcept;
+
+ private:
+  Allocator allocator_;              ///< Allocator
+  size_type num_keys_;               ///< Number of keys inserted into trie
+  size_type num_nodes_;              ///< Number of internal nodes
+  std::vector<LabelType> last_key_;  ///< Last key inserted into trie
+
+  static constexpr LabelType root_label_ = sizeof(LabelType) == 1 ? ' ' : -1;  ///< Sentinel value
+
+  struct level;
+  size_type num_levels_;       ///< Number of trie levels
+  std::vector<level> levels_;  ///< Host-side array of levels
+  level* d_levels_ptr_;        ///< Device-side array of levels
+
+  using bitset_type = typename detail::dynamic_bitset<Allocator>;  ///< Bitset type
+  using bitset_ref  = typename bitset_type::ref_type;              ///< Bitset ref
+  /// Type of the allocator to (de)allocate bitset refs
+  using bitset_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<bitset_ref>;
+  ///< refs to per-level louds bitsets
+  thrust::device_vector<bitset_ref, bitset_allocator_type> louds_refs_;
+  ///< refs to per-level outs bitsets
+  thrust::device_vector<bitset_ref, bitset_allocator_type> outs_refs_;
+
+  bitset_ref* louds_refs_ptr_;  ///< Raw device pointer to louds_refs_
+  bitset_ref* outs_refs_ptr_;   ///< Raw device pointer to outs_refs_
+
+  trie<LabelType, Allocator>* device_ptr_;  ///< Device-side copy of trie
+
+  ///< Non-owning container ref type
+  template <typename... Operators>
+  using ref_type = cuco::experimental::trie_ref<LabelType, Allocator, Operators...>;
+
+  // Mixins need to be friends with this class in order to access private members
+  template <typename Op, typename Ref>
+  friend class detail::operator_impl;
+
+  // Host bitset to buffer bit updates before bulk initializing dynamic_bitset on device
+  // TODO: This struct replicates code from dynamic_bitset. Remove these parts from dynamic_bitset?
+  struct host_bitset {
+    using word_type = typename bitset_type::word_type;
+    thrust::host_vector<word_type> words_;
+    size_type n_bits_;
+
+    host_bitset() noexcept : n_bits_{0} {}
+    void push_back(bool bit) noexcept
+    {
+      if (n_bits_ % bits_per_block == 0) {
+        words_.resize(words_.size() + words_per_block);  // Extend storage by one block
+      }
+
+      set(n_bits_++, bit);
+    }
+    void set(size_type index, bool bit) noexcept
+    {
+      size_type word_id = index / bits_per_word;
+      size_type bit_id  = index % bits_per_word;
+      if (bit) {
+        words_[word_id] |= 1UL << bit_id;
+      } else {
+        words_[word_id] &= ~(1UL << bit_id);
+      }
+    }
+    void set_last(bool bit) noexcept { set(n_bits_ - 1, bit); }
+    void clear() noexcept
+    {
+      words_.clear();
+      n_bits_ = 0;
+    }
+
+   private:
+    const size_type words_per_block = bitset_type::words_per_block;
+    const size_type bits_per_block  = bitset_type::bits_per_block;
+    const size_type bits_per_word   = bitset_type::bits_per_word;
+  };
+
+  /**
+   * @brief Struct to represent each trie level
+   */
+  struct level {
+    level();
+    level(level&&) = default;  ///< Move constructor
+
+    bitset_type louds_;  ///< Indicates links to next and previous level
+    bitset_type outs_;   ///< Indicates terminal nodes of valid keys
+
+    host_bitset h_louds_;  ///< Host buffer for louds
+    host_bitset h_outs_;   ///< Host buffer for outs
+
+    /// Type of the allocator to (de)allocate labels
+    using label_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<LabelType>;
+    thrust::device_vector<LabelType, label_allocator_type> labels_;  ///< Labels at this level
+    LabelType* labels_ptr_;  ///< Raw device pointer to labels
+
+    std::vector<LabelType> h_labels_;  ///< Host copy of labels, using std::vector for performance
+
+    size_type offset_;  ///< Cumulative node count in parent levels
+  };
+};
+
+}  // namespace experimental
+}  // namespace cuco
+
+#include <cuco/detail/trie/trie.inl>
diff --git a/include/cuco/trie_ref.cuh b/include/cuco/trie_ref.cuh
new file mode 100644
index 000000000..244d3adfb
--- /dev/null
+++ b/include/cuco/trie_ref.cuh
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cuco/operator.hpp>
+
+namespace cuco {
+namespace experimental {
+
+template <typename LabelType, class Allocator>
+class trie;
+
+/**
+ * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary
+ * operations defined in `include/cuco/operator.hpp`
+ *
+ * @tparam LabelType Trie label type
+ * @tparam Operators Device operator options defined in `include/cuco/operator.hpp`
+ */
+template <typename LabelType, class Allocator, typename... Operators>
+class trie_ref
+  : public detail::operator_impl<Operators, trie_ref<LabelType, Allocator, Operators...>>... {
+ public:
+  /**
+   * @brief Constructs trie_ref.
+   *
+   * @param trie Non-owning ref of trie
+   */
+  __host__ __device__ explicit constexpr trie_ref(const trie<LabelType, Allocator>* trie) noexcept;
+
+ private:
+  const trie<LabelType, Allocator>* trie_;
+
+  // Mixins need to be friends with this class in order to access private members
+  template <typename Op, typename Ref>
+  friend class detail::operator_impl;
+};
+
+}  // namespace experimental
+}  // namespace cuco
+
+#include <cuco/detail/trie/trie_ref.inl>
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3deeeddf1..62569500f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -34,12 +34,13 @@ endif()
 ###################################################################################################
 function(ConfigureTest TEST_NAME)
     add_executable(${TEST_NAME} ${ARGN})
-    target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart)
+    target_link_options(${TEST_NAME} PRIVATE -fopenmp)
+    target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart Threads::Threads)
     target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
     set_target_properties(${TEST_NAME} PROPERTIES
                                        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
     target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra
-      --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage)
+      --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage -fopenmp)
     catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests)
 endfunction(ConfigureTest)
 
@@ -100,8 +101,13 @@ ConfigureTest(STATIC_MULTIMAP_TEST
 ###################################################################################################
 # - dynamic_bitset tests --------------------------------------------------------------------------
 ConfigureTest(DYNAMIC_BITSET_TEST
-    dynamic_bitset/find_next_test.cu
-    dynamic_bitset/get_test.cu
-    dynamic_bitset/rank_test.cu
-    dynamic_bitset/select_test.cu
-    dynamic_bitset/size_test.cu)
+    trie/dynamic_bitset/find_next_test.cu
+    trie/dynamic_bitset/get_test.cu
+    trie/dynamic_bitset/rank_test.cu
+    trie/dynamic_bitset/select_test.cu
+    trie/dynamic_bitset/size_test.cu)
+
+###################################################################################################
+# - trie tests ------------------------------------------------------------------------------
+ConfigureTest(TRIE_TEST
+    trie/lookup_test.cu)
diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/trie/dynamic_bitset/find_next_test.cu
similarity index 100%
rename from tests/dynamic_bitset/find_next_test.cu
rename to tests/trie/dynamic_bitset/find_next_test.cu
diff --git a/tests/dynamic_bitset/get_test.cu b/tests/trie/dynamic_bitset/get_test.cu
similarity index 100%
rename from tests/dynamic_bitset/get_test.cu
rename to tests/trie/dynamic_bitset/get_test.cu
diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/trie/dynamic_bitset/rank_test.cu
similarity index 100%
rename from tests/dynamic_bitset/rank_test.cu
rename to tests/trie/dynamic_bitset/rank_test.cu
diff --git a/tests/dynamic_bitset/select_test.cu b/tests/trie/dynamic_bitset/select_test.cu
similarity index 100%
rename from tests/dynamic_bitset/select_test.cu
rename to tests/trie/dynamic_bitset/select_test.cu
diff --git a/tests/dynamic_bitset/size_test.cu b/tests/trie/dynamic_bitset/size_test.cu
similarity index 100%
rename from tests/dynamic_bitset/size_test.cu
rename to tests/trie/dynamic_bitset/size_test.cu
diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu
new file mode 100644
index 000000000..cdc38cd35
--- /dev/null
+++ b/tests/trie/lookup_test.cu
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <trie/utils.hpp>
+#include <utils.hpp>
+
+#include <cuco/trie.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+
+#include <catch2/catch_test_macros.hpp>
+
+using namespace cuco::utility;
+
+template <typename LabelType>
+void trie_lookup_test()
+{
+  std::size_t num_keys       = 64 * 1024;
+  std::size_t max_key_length = 6;
+
+  thrust::host_vector<LabelType> labels;
+  thrust::host_vector<size_t> offsets;
+
+  distribution::unique lengths_dist;
+  distribution::gaussian labels_dist{0.5};
+  cuco::test::trie::generate_labels(
+    labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist);
+  auto keys = cuco::test::trie::sorted_keys(labels, offsets);
+
+  cuco::experimental::trie<LabelType> trie;
+  for (auto key : keys) {
+    trie.insert(key.begin(), key.end());
+  }
+  trie.build();
+
+  thrust::device_vector<LabelType> d_labels = labels;
+  thrust::device_vector<size_t> d_offsets   = offsets;
+  thrust::device_vector<size_t> result(num_keys, -1lu);
+
+  trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin());
+  REQUIRE(cuco::test::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys)));
+}
+
+TEST_CASE("Trie lookup", "")
+{
+  trie_lookup_test<int>();
+  trie_lookup_test<char>();
+}
diff --git a/tests/trie/utils.hpp b/tests/trie/utils.hpp
new file mode 100644
index 000000000..7eae100af
--- /dev/null
+++ b/tests/trie/utils.hpp
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <chrono>
+#include <cuco/utility/key_generator.hpp>
+#include <fstream>
+#include <sstream>
+#include <thrust/host_vector.h>
+
+namespace cuco {
+namespace test {
+namespace trie {
+
+struct valid_key {
+  valid_key(size_t num_keys) : num_keys_(num_keys) {}
+  __host__ __device__ bool operator()(size_t x) const { return x < num_keys_; }
+  const size_t num_keys_;
+};
+
+template <typename LabelType, typename LengthsDist, typename LabelsDist>
+void generate_labels(thrust::host_vector<LabelType>& labels,
+                     thrust::host_vector<size_t>& offsets,
+                     size_t num_keys,
+                     size_t max_key_length,
+                     LengthsDist lengths_dist,
+                     LabelsDist labels_dist)
+{
+  cuco::utility::key_generator gen;
+
+  offsets.resize(num_keys);
+  gen.generate(lengths_dist, offsets.begin(), offsets.end());
+
+  for (auto& offset : offsets) {
+    offset = 1 + (offset % max_key_length);
+  }
+
+  offsets.push_back(0);
+  thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin());
+
+  labels.resize(offsets.back());
+  gen.generate(labels_dist, labels.begin(), labels.end());
+}
+
+template <typename LabelType>
+std::vector<std::vector<LabelType>> sorted_keys(thrust::host_vector<LabelType>& labels,
+                                                thrust::host_vector<size_t>& offsets)
+{
+  std::vector<std::vector<LabelType>> keys;
+  size_t num_keys = offsets.size() - 1;
+  for (size_t key_id = 0; key_id < num_keys; key_id++) {
+    std::vector<LabelType> cur_key;
+    for (size_t pos = offsets[key_id]; pos < offsets[key_id + 1]; pos++) {
+      cur_key.push_back(labels[pos]);
+    }
+    keys.push_back(cur_key);
+  }
+  sort(keys.begin(), keys.end());
+  return keys;
+}
+
+}  // namespace trie
+}  // namespace test
+}  // namespace cuco