diff --git a/CMakeLists.txt b/CMakeLists.txt
index a00476b4e..2bf5f58e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,12 @@ target_include_directories(cuco INTERFACE
 target_link_libraries(cuco INTERFACE CCCL::CCCL CUDA::toolkit)
 target_compile_features(cuco INTERFACE cxx_std_17 cuda_std_17)
 
+###################################################################################################
+# - Optionally download RoaringFormatSpec test data -----------------------------------------------
+
+option(CUCO_DOWNLOAD_ROARING_TESTDATA "Download RoaringFormatSpec test data" ON)
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/roaring_testdata.cmake)
+
 ###################################################################################################
 # - optionally build tests ------------------------------------------------------------------------
 
diff --git a/README.md b/README.md
index 4c04ac88f..ae00028e9 100644
--- a/README.md
+++ b/README.md
@@ -259,4 +259,11 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries.
 
 #### Examples:
-- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw))
\ No newline at end of file
+- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw))
+
+### roaring_bitmap
+
+`cuco::experimental::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
+
+#### Examples:
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WA1v2zYT_iv3qsAgN7aVpB_ZnI_NjdPNWF97sN0VQ1MIlETbhGVRI6k4XpD__h5JfSbK2nV75wCxRR7vnjs-dzzqzpFUSsYT6Qw-3jkscgZHXScmySojK-oMnDCLiNN1JM9EqJ-959cJPIdLnu4FW60VuGEHjg-PX8Hk1_FoPITL6eyX6Wy4GE8nfS1qxN-xkCaSRpAlERWg1hSGKQnxK5_pwq9UaBxw3D8EVwtcO_nctdM5NVr2PIMt2UPCFWSSohomYcliCvQ2pKkClkDIt2nMSBJS2DG1NqZyPQYO_JYr4YEiKE9wRYpPy7okEFVC15-1UunA83a7XZ8Y2H0uVl5shaX3bnx5NZlf9RB6uex9EmNgQdDfMybQ8WAPJEVkIQkQb0x2wAWQlaA4p7hGvhNMsWTVBcmXakcENXoiJpVgQaYawStwov91AQwfSTBwwzmM59cOvBnOx_Ou0fNhvPhp-n4BH4az2XCyGF_NYTrDzZqMxnqr8OktDCe_wc_jyagLFEOHpuhtKrQXCJXpsNLIxnBOaQPGkltYMqUhW7IQCgLBit9QkaBbkFKxZZZqCDIyemK2ZYooM_bIOWPKu06uk2csCeMsonAWZiH3BCcCNfoBU1uS9sNsffFIJlMsZmrvKUGYkv11ml481BQRT6rIC_FfRJcXT06yRLVPqn1KfWugIaDWIpPKi-gNOuLf0FBx0V-3icR8hYSI2yezhGHsJInrKupymvlyLxXdNpYvkQ-UNMcYbxnUvElWjSFrydjxnlsi_2ASbI0K_CCLNz69JUgFimG304FgdAkjusVNxGAoilsp9dbnKdXcLSSn1oLcjDnfZKlRDMNfxrIqFuPEZnZuCXaYL5xgmuw4vDjuoSKwygyRkPQUXr-sDYObcqFMniExt0R1YCn4VqMx-j_OLKQ3RvqtEZkjcz-5Os8lJvoK6Z8FfawlXkO2eKrWdDDDUy4ZRm1f0hoLRLgBZv3X7hbOop8orzKBfNdzIRcCI45jMouRpjAhWxrvu9plDKQyQksex3ynU8hs-MCY6MFH66wucjxTIktkP2CJmXRtlDpf45AXxDzwXh2dnJDoO0-DiIgiXquxzmMo_x6ORyCKTc-Z9vplhcPS4x_E8fql12quU5L4BzykKCzqPM7Pgnzrt2SDOZLq4ge90eX7y6k_mn6YvJsOR_5sOpyNJz_6i6v5YjRcDM-nE31IBFhhqSoTxdRGzP40xqzDsoHlKEH-wM90v8DfmMUB57Flo4u4BwOb8Ug7TNVv8mzxNav8lKg1gr9DvUAytLWiCdXZ7G_oXsI5fPzkdqB3AbY4DQaN6nZWmASjADT3jRF9gGCHgBVTG0cETPoSMfo3xZIu1GYzrLUvjn110SkUAXgeXGLpQg9_zygmmcFTHDgVHe6Ql_cFJ6pMKVT8YwScXQ1H_73qb6NneqinxwozxoU8IG1OGeinhbR2wW0Rgw1G-_AUv87g6FB_9O-Dc_NQiwsYdf00k2s_ILjDm06p-75hBBUbpZW2M-zY7O-Dg82f6XyBTP6s3pOa3m-_TG-bTlsYnyBYWzxdozSgK5a4na41QZPI7RTK74HGukn6ejK-fvkXyNhaEQwV_x9MxBJUcfGRaVv6PkdL497naanFLC1vzfZmcWx3G5-_K56_Ysc_Z2vYtHV0-DdsPRA5vD3OtVWyrSKvGiJPImY2adkDoEwn7rFW8SdwD2-_zRfAAbC_mRlmS780M-4qjmAXHvpESiqUqztobQbPk4gmyl8SFC5rfBd7uPw3bBGQPpbK4oXxKVDk17a6B3f3hX39pR_0NybFFE2ZTNHPNjfzNtaMug-Pqm4uxOVggFQnYm9tYaq7_9FSfcxtjkrR5cJLsyKkQsDZGbrwlqCYuXtpORzQww_t6DGzDiMR59hzX0xQzNB96caPVNn7qGR_0PIoNdr0CHLEKKsad_vbTD7ysVOFZ4gdYKirjumWt9hrY-1pnoFl9AqWVNcH07unLElo1EKaYK_w3A6y5ZIKt0RTMz6j2Hsbr3BjeW5dz5k44x5FrqA4RQVeFpUfEqnOwjURzy_cAosgOz_lRsbMu9ZcX9cx3CGkaMNwrjqMuaRuDUleevMbRel43tuXEbD0xWpPBdsif0mMCBq3kKpfaY67doe_FPZpbeNtt1Q_FlyCCS2xX44jnSJ4BOkXDkg5ltT2rVPSJG-0Go1Xi_cE7B5q5kr8tjfx_AZRJ0CzTOhW8KICYSuE4Z2OvuZyzdQbvLP08jtLzSOywrX5rcSi1_IPr-PWhHyyBnUrFMV03cvi7qSjpw1Zy-LJAOp1ps_FBX4lc17GQY_zpfvIaB1HDgzrQ6070LWPqf3dveVknvcNMzhhsl-3QluitZoe-hlbRngpNk190cvrPt4fjWdlhas346Ap5UdMIO7WVaelmzILQ_1Opvk5t21-e59UKj_AMtd-lSuKdaH9m69S-Fe12cOqqa39GucUFLFVHJHbKp5bGOT121Y0DBOJ0zUxIzmEB4W8tqGFxPdwCAM4wsln-nCsjJVHRvttrnW_8usassCwESsHwzJg3kg2bn7XTtUW5J9r5wuug52HCx8dVLl31p8kYkukqtN19PtRrJOiet_rJDdheHT8KjvCaQsLJ50eqjsPDw6OTqBHRLg-l1v_5BB6PSysCv8p3RxEvZhsA_OGOGZBTWcYhjEO3tiXujigmbZx7rvFPNboxjxWK-f-
\ No newline at end of file
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 34e63fd28..9940c82b9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -28,7 +28,7 @@ function(ConfigureBench BENCH_NAME)
     target_include_directories(${BENCH_NAME} PRIVATE
                                              "${CMAKE_CURRENT_SOURCE_DIR}")
     target_compile_options(${BENCH_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra
-      --compiler-options=-Werror -Wno-deprecated-gpu-targets --expt-extended-lambda)
+      --compiler-options=-Werror -Wno-deprecated-gpu-targets --expt-extended-lambda -lineinfo)
     # Add GCC-specific warning suppression only for GCC
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
       target_compile_options(${BENCH_NAME} PRIVATE -Xcompiler -Wno-subobject-linkage)
@@ -104,3 +104,8 @@ ConfigureBench(HYPERLOGLOG_BENCH
 ConfigureBench(BLOOM_FILTER_BENCH
   bloom_filter/add_bench.cu
   bloom_filter/contains_bench.cu)
+
+###################################################################################################
+# - roaring_bitmap benchmarks ---------------------------------------------------------------------
+ConfigureBench(ROARING_BITMAP_BENCH
+  roaring_bitmap/contains_bench.cu)
\ No newline at end of file
diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu
new file mode 100644
index 000000000..20cb27a27
--- /dev/null
+++ b/benchmarks/roaring_bitmap/contains_bench.cu
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark_defaults.hpp>
+#include <benchmark_utils.hpp>
+
+#include <cuco/roaring_bitmap.cuh>
+#include <cuco/utility/key_generator.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <thrust/device_vector.h>
+#include <thrust/universal_vector.h>
+
+#include <filesystem>
+#include <fstream>
+#include <string>
+
+using namespace cuco::benchmark;  // defaults
+using namespace cuco::utility;    // key_generator, distribution
+
+template <typename T>
+void roaring_bitmap_contains(nvbench::state& state, nvbench::type_list<T>)
+{
+  auto const num_items   = state.get_int64("NumInputs");
+  auto const bitmap_file = state.get_string_or_default("BitmapFile", {});
+
+  std::ifstream file(bitmap_file, std::ios::binary);
+  if (!file.is_open()) { state.skip("Bitmap file not found"); }
+
+  // Get file size
+  auto const file_size = std::filesystem::file_size(bitmap_file);
+
+  thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
+
+  file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
+  file.close();
+
+  cuco::experimental::roaring_bitmap<T> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
+
+  thrust::device_vector<T> items(num_items);
+
+  key_generator gen{};
+  gen.generate(distribution::unique{}, items.begin(), items.end());
+
+  thrust::device_vector<bool> contained(items.size(), false);
+
+  state.add_element_count(items.size());
+  state.add_global_memory_reads<T>(items.size(), "InputSize");
+
+  auto& summ = state.add_summary("BitmapSizeMB");
+  summ.set_string("hint", "BitmapSize");
+  summ.set_string("short_name", "BitmapSizeMB");
+  summ.set_string("description", "Bitmap size in MB");
+  summ.set_float64("value", static_cast<double>(file_size) / (1024 * 1024));
+
+  state.exec([&](nvbench::launch& launch) {
+    roaring_bitmap.contains_async(
+      items.begin(), items.end(), contained.begin(), {launch.get_stream()});
+  });
+}
+
+NVBENCH_BENCH_TYPES(roaring_bitmap_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::uint32_t>))
+  .set_name("roaring_bitmap_contains")
+  .add_int64_power_of_two_axis("NumInputs", {32})
+// Default benchmark is only available if the Roaring bitmap testdata has been downloaded
+#ifdef CUCO_ROARING_DATA_DIR
+  .add_string_axis("BitmapFile", {std::string(CUCO_ROARING_DATA_DIR) + "/bitmapwithruns.bin"})
+#endif
+  .set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
+
+NVBENCH_BENCH_TYPES(roaring_bitmap_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::uint64_t>))
+  .set_name("roaring_bitmap_contains")
+  .add_int64_power_of_two_axis("NumInputs", {31})
+// Default benchmark is only available if the Roaring bitmap testdata has been downloaded
+#ifdef CUCO_ROARING_DATA_DIR
+  .add_string_axis("BitmapFile", {std::string(CUCO_ROARING_DATA_DIR) + "/portable_bitmap64.bin"})
+#endif
+  .set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
\ No newline at end of file
diff --git a/cmake/roaring_testdata.cmake b/cmake/roaring_testdata.cmake
new file mode 100644
index 000000000..168519866
--- /dev/null
+++ b/cmake/roaring_testdata.cmake
@@ -0,0 +1,41 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Only act if enabled
+if(NOT CUCO_DOWNLOAD_ROARING_TESTDATA)
+  return()
+endif()
+
+set(CUCO_ROARING_DATA_DIR "${CMAKE_BINARY_DIR}/data/roaring_bitmap")
+
+file(MAKE_DIRECTORY "${CUCO_ROARING_DATA_DIR}")
+
+set(ROARING_FORMATSPEC_BASE "https://raw.githubusercontent.com/RoaringBitmap/RoaringFormatSpec/5177ad9")
+
+rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata/bitmapwithoutruns.bin"
+                                 "${CUCO_ROARING_DATA_DIR}/bitmapwithoutruns.bin"
+                                 "d719ae2e0150a362ef7cf51c361527585891f01460b1a92bcfb6a7257282a442")
+
+rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata/bitmapwithruns.bin"
+                                 "${CUCO_ROARING_DATA_DIR}/bitmapwithruns.bin"
+                                 "1f1909bfdd354fa2f0694fe88b8076833ca5383ad9fc3f68f2709c84a2ab70e3")
+
+rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata64/portable_bitmap64.bin"
+                                 "${CUCO_ROARING_DATA_DIR}/portable_bitmap64.bin"
+                                 "b5a553a759167f5f9ccb3fa21552d943b4c73235635b753376f4faf62067d178")
+
+message(STATUS "Roaring Bitmap test data downloaded to: ${CUCO_ROARING_DATA_DIR}")
+
+# Define macro only when data is available
+add_compile_definitions(CUCO_ROARING_DATA_DIR="${CUCO_ROARING_DATA_DIR}")
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 68e35931f..3318c911e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -52,3 +52,4 @@ ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/
 ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu")
 ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu")
 ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu")
+ConfigureExample(ROARING_BITMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/roaring_bitmap/host_bulk_example.cu")
diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
new file mode 100644
index 000000000..e70e6db3a
--- /dev/null
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/roaring_bitmap.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+#include <thrust/device_vector.h>
+#include <thrust/logical.h>
+#include <thrust/universal_vector.h>
+
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+/**
+ * @file host_bulk_example.cu
+ * @brief Demonstrates usage of the roaring_bitmap "bulk" lookup host APIs.
+ *
+ * In this example we load two 32-bit bitmaps and one 64-bit bitmap (portable format) from the
+ * [RoaringBitmapFormatSpec](https://github.com/RoaringBitmap/RoaringFormatSpec) repository and
+ * check if the bulk lookup API returns the correct results. Namely, we test the following files:
+ * - [bitmapwithoutruns.bin
+ * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithoutruns.bin)
+ * - [bitmapwithruns.bin
+ * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithruns.bin)
+ * - [portable_bitmap64.bin
+ * (64-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/portable_bitmap64.bin)
+ *
+ * @note This example requires the cmake option -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to be set.
+ *
+ */
+
+template <typename KeyType>
+bool check(std::string const& bitmap_file_path)
+{
+  auto generate_keys = []() -> thrust::device_vector<KeyType> {
+    if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint32_t>) {
+      // Create query keys for the bitmapwith{out}runs.bin files:
+      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/README.md#test-data
+      std::vector<cuda::std::uint32_t> keys;
+      for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
+        keys.push_back(k);
+      }
+      for (int k = 100000; k < 200000; ++k) {
+        keys.push_back(3 * k);
+      }
+      for (int k = 700000; k < 800000; ++k) {
+        keys.push_back(k);
+      }
+      return thrust::device_vector<cuda::std::uint32_t>(keys.begin(), keys.end());
+    } else if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint64_t>) {
+      // Create query keys for the portable_bitmap64.bin file:
+      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/README.md#portable_bitmap64bin
+      std::vector<cuda::std::uint64_t> keys;
+      for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) {
+        keys.push_back(k);
+      }
+      for (cuda::std::uint64_t k = 0x0A000ull; k < 0x10000ull; ++k) {
+        keys.push_back(k);
+      }
+      keys.push_back(0x20000ull);
+      keys.push_back(0x20005ull);
+      for (cuda::std::uint64_t i = 0; i < 0x10000ull; i += 2ull) {
+        keys.push_back(0x80000ull + i);
+      }
+      return thrust::device_vector<cuda::std::uint64_t>(keys.begin(), keys.end());
+    } else {
+      static_assert(cuco::dependent_false<KeyType>, "KeyType must be uint32_t or uint64_t");
+      return {};
+    }
+  };
+
+  // Open file
+  std::ifstream file(bitmap_file_path, std::ios::binary);
+  if (!file.is_open()) {
+    std::cerr << "Failed to open " << bitmap_file_path << std::endl;
+    return false;
+  }
+
+  // Get file size
+  auto file_size = std::filesystem::file_size(bitmap_file_path);
+
+  // Allocate host memory for the bitmap file
+  thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
+
+  // Read file into memory
+  file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
+  file.close();
+
+  // Create roaring bitmap from the file
+  cuco::experimental::roaring_bitmap<KeyType> roaring_bitmap(
+    thrust::raw_pointer_cast(buffer.data()));
+
+  // Generate query keys (all should be contained in the bitmap)
+  auto keys = generate_keys();
+
+  // Create a vector to store the results
+  thrust::device_vector<bool> contained(keys.size(), false);
+
+  // Bulk-lookup query keys against the bitmap
+  roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin());
+
+  // Check if all the keys are contained in the bitmap
+  bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
+  return all_contained;
+}
+
+int main()
+{
+#ifdef CUCO_ROARING_DATA_DIR
+  std::string const data_dir = CUCO_ROARING_DATA_DIR;
+  bool success               = check<cuda::std::uint32_t>(data_dir + "/bitmapwithoutruns.bin");
+  success &= check<cuda::std::uint32_t>(data_dir + "/bitmapwithruns.bin");
+  success &= check<cuda::std::uint64_t>(data_dir + "/portable_bitmap64.bin");
+
+  std::cout << "success: " << std::boolalpha << success << std::endl;
+
+  return success ? 0 : 1;
+#else
+  std::cerr << "This example requires CUCO_ROARING_DATA_DIR to be defined (build with cmake option "
+               "-DCUCO_DOWNLOAD_ROARING_TESTDATA=ON)"
+            << std::endl;
+  return 1;
+#endif
+}
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
new file mode 100644
index 000000000..ff8dc3d13
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda/std/cstddef>
+#include <cuda/stream_ref>
+
+namespace cuco::experimental {
+
+template <class T, class Allocator>
+roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
+                                             Allocator const& alloc,
+                                             cuda::stream_ref stream)
+  : storage_{bitmap, alloc, stream}
+{
+}
+
+template <class T, class Allocator>
+template <class InputIt, class OutputIt>
+void roaring_bitmap<T, Allocator>::contains(InputIt first,
+                                            InputIt last,
+                                            OutputIt output,
+                                            cuda::stream_ref stream) const
+{
+  ref_type{storage_.ref()}.contains(first, last, output, stream);
+}
+
+template <class T, class Allocator>
+template <class InputIt, class OutputIt>
+void roaring_bitmap<T, Allocator>::contains_async(InputIt first,
+                                                  InputIt last,
+                                                  OutputIt output,
+                                                  cuda::stream_ref stream) const noexcept
+{
+  ref_type{storage_.ref()}.contains_async(first, last, output, stream);
+}
+
+template <class T, class Allocator>
+cuda::std::size_t roaring_bitmap<T, Allocator>::size() const noexcept
+{
+  return ref_type{storage_.ref()}.size();
+}
+
+template <class T, class Allocator>
+bool roaring_bitmap<T, Allocator>::empty() const noexcept
+{
+  return ref_type{storage_.ref()}.empty();
+}
+
+template <class T, class Allocator>
+cuda::std::byte const* roaring_bitmap<T, Allocator>::data() const noexcept
+{
+  return ref_type{storage_.ref()}.data();
+}
+
+template <class T, class Allocator>
+cuda::std::size_t roaring_bitmap<T, Allocator>::size_bytes() const noexcept
+{
+  return ref_type{storage_.ref()}.size_bytes();
+}
+
+template <class T, class Allocator>
+typename roaring_bitmap<T, Allocator>::allocator_type roaring_bitmap<T, Allocator>::allocator()
+  const noexcept
+{
+  return storage_.allocator();
+}
+
+template <class T, class Allocator>
+typename roaring_bitmap<T, Allocator>::ref_type roaring_bitmap<T, Allocator>::ref() const noexcept
+{
+  return ref_type{storage_.ref()};
+}
+}  // namespace cuco::experimental
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
new file mode 100644
index 000000000..7276dfae8
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/error.hpp>
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh>
+#include <cuco/detail/roaring_bitmap/util.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cub/device/device_transform.cuh>
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/functional>
+#include <cuda/std/iterator>
+#include <cuda/stream_ref>
+#include <thrust/iterator/constant_iterator.h>
+
+namespace cuco::experimental::detail {
+
+// primary template
+template <class T>
+class roaring_bitmap_impl {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+template <>
+class roaring_bitmap_impl<cuda::std::uint32_t> {
+ public:
+  using storage_ref_type = roaring_bitmap_storage_ref<cuda::std::uint32_t>;
+
+  static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value
+
+  __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref)
+    : storage_ref_{storage_ref},
+      offsets_aligned_{(reinterpret_cast<cuda::std::uintptr_t>(
+                         storage_ref_.data() + storage_ref_.metadata().container_offsets)) %
+                         sizeof(cuda::std::uint32_t) ==
+                       0},
+      aligned_16_{(reinterpret_cast<cuda::std::uintptr_t>(storage_ref_.data() +
+                                                          storage_ref_.metadata().key_cards)) %
+                    sizeof(cuda::std::uint16_t) ==
+                  0}  // if base address of key_cards is aligned, then all containers are aligned
+  {
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains(InputIt first,
+                         InputIt last,
+                         OutputIt contained,
+                         cuda::stream_ref stream = {}) const
+  {
+    this->contains_async(first, last, contained, stream);
+    stream.wait();
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt contained,
+                               cuda::stream_ref stream = {}) const noexcept
+  {
+    if (this->empty()) {
+      cub::DeviceTransform::Transform(
+        thrust::constant_iterator<bool>(false),
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>([] __device__(auto /* dummy */) { return false; }),
+        stream.get());
+    } else {
+      cub::DeviceTransform::Transform(
+        first,
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>(
+          [*this] __device__(auto key) { return this->contains(key); }),
+        stream.get());
+    }
+  }
+
+  __device__ bool contains(cuda::std::uint32_t value) const
+  {
+    if (storage_ref_.metadata().num_keys == 0) { return false; }
+
+    if (aligned_16_) {
+      return this->dispatch_contains<true>(value);
+    } else {
+      return this->dispatch_contains<false>(value);
+    }
+  }
+
+  template <bool Aligned>
+  __device__ bool dispatch_contains(cuda::std::uint32_t value) const
+  {
+    cuda::std::uint16_t const upper = value >> 16;
+    cuda::std::uint16_t const lower = value & 0xFFFF;
+    cuda::std::uint16_t key;
+
+    if (storage_ref_.metadata().num_containers < binary_search_threshold) {
+// linear search
+#pragma unroll
+      for (cuda::std::uint32_t i = 0; i < storage_ref_.metadata().num_containers; i++) {
+        cuda::std::byte const* key_ptr =
+          storage_ref_.key_cards() + (i * 2) * sizeof(cuda::std::uint16_t);
+        if constexpr (Aligned) {
+          key = aligned_load<cuda::std::uint16_t>(key_ptr);
+        } else {
+          key = misaligned_load<cuda::std::uint16_t>(key_ptr);
+        }
+        if (key == upper) { return this->contains_container<Aligned>(lower, i); }
+        if (key > upper) { return false; }
+      }
+    } else {
+      // binary search
+      cuda::std::uint32_t left  = 0;
+      cuda::std::uint32_t right = storage_ref_.metadata().num_containers;
+      while (left < right) {
+        cuda::std::uint32_t mid = left + (right - left) / 2;
+        cuda::std::byte const* key_ptr =
+          storage_ref_.key_cards() + (mid * 2) * sizeof(cuda::std::uint16_t);
+        if constexpr (Aligned) {
+          key = aligned_load<cuda::std::uint16_t>(key_ptr);
+        } else {
+          key = misaligned_load<cuda::std::uint16_t>(key_ptr);
+        }
+
+        if (key == upper) {
+          return this->contains_container<Aligned>(lower, mid);
+        } else if (key < upper) {
+          left = mid + 1;
+        } else {
+          right = mid;
+        }
+      }
+    }
+    return false;
+  }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept
+  {
+    return storage_ref_.metadata().num_keys;
+  }
+
+  [[nodiscard]] __host__ __device__ bool empty() const noexcept { return this->size() == 0; }
+
+  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept
+  {
+    return storage_ref_.data();
+  }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept
+  {
+    return storage_ref_.metadata().size_bytes;
+  }
+
+  template <bool Aligned>
+  __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
+  {
+    cuda::std::uint32_t offset;
+    cuda::std::byte const* offset_ptr =
+      storage_ref_.container_offsets() + index * sizeof(cuda::std::uint32_t);
+    if (offsets_aligned_) {
+      offset = aligned_load<cuda::std::uint32_t>(offset_ptr);
+    } else {
+      offset = misaligned_load<cuda::std::uint32_t>(offset_ptr);
+    }
+    cuda::std::byte const* container = storage_ref_.data() + offset;
+    if (storage_ref_.metadata().has_run and check_bit(storage_ref_.run_container_bitmap(), index)) {
+      return this->contains_run_container<Aligned>(container, lower);
+    } else {
+      cuda::std::uint32_t card;
+      cuda::std::byte const* card_ptr =
+        storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t);
+      if constexpr (Aligned) {
+        card = 1u + aligned_load<cuda::std::uint16_t>(card_ptr);
+      } else {
+        card = 1u + misaligned_load<cuda::std::uint16_t>(card_ptr);
+      }
+      if (card <= storage_ref_type::metadata_type::max_array_container_card) {
+        return this->contains_array_container<Aligned>(container, lower, card);
+      } else {
+        return this->contains_bitset_container(container, lower);
+      }
+    }
+  }
+
+  template <bool Aligned>
+  __device__ bool contains_array_container(cuda::std::byte const* container,
+                                           cuda::std::uint16_t lower,
+                                           cuda::std::uint32_t card) const
+  {
+    cuda::std::uint16_t elem;
+    // Use linear search for small arrays, binary search for larger ones
+    if (card < binary_search_threshold) {
+      for (cuda::std::uint32_t i = 0; i < card; i++) {
+        cuda::std::byte const* elem_ptr = container + i * sizeof(cuda::std::uint16_t);
+        if constexpr (Aligned) {
+          elem = aligned_load<cuda::std::uint16_t>(elem_ptr);
+        } else {
+          elem = misaligned_load<cuda::std::uint16_t>(elem_ptr);
+        }
+        if (elem == lower) { return true; }
+      }
+      return false;
+    } else {
+      cuda::std::uint32_t left  = 0;
+      cuda::std::uint32_t right = card;
+
+      while (left < right) {
+        cuda::std::uint32_t mid         = left + (right - left) / 2;
+        cuda::std::byte const* elem_ptr = container + mid * sizeof(cuda::std::uint16_t);
+        if constexpr (Aligned) {
+          elem = aligned_load<cuda::std::uint16_t>(elem_ptr);
+        } else {
+          elem = misaligned_load<cuda::std::uint16_t>(elem_ptr);
+        }
+        if (elem == lower) {
+          return true;
+        } else if (elem < lower) {
+          left = mid + 1;
+        } else {
+          right = mid;
+        }
+      }
+      return false;
+    }
+  }
+
+  __device__ bool contains_bitset_container(cuda::std::byte const* container,
+                                            cuda::std::uint16_t lower) const
+  {
+    return check_bit(container, lower);
+  }
+
+  template <bool Aligned>
+  __device__ bool contains_run_container(cuda::std::byte const* container,
+                                         cuda::std::uint16_t lower) const
+  {
+    // TODO implement binary search
+    cuda::std::uint16_t num_runs;
+    if constexpr (Aligned) {
+      num_runs = aligned_load<cuda::std::uint16_t>(container);
+    } else {
+      num_runs = misaligned_load<cuda::std::uint16_t>(container);
+    }
+
+    cuda::std::uint16_t start;
+    cuda::std::uint32_t end;
+
+    for (cuda::std::uint32_t i = 0; i < num_runs; i++) {
+      // the first 16 bits of the run container denotes the number of runs
+      // followed by the sequence of runs as (start, end) U16 pairs
+      cuda::std::byte const* start_ptr = container + (i * 2 + 1) * sizeof(cuda::std::uint16_t);
+      // TODO load start+end in one instruction
+      if constexpr (Aligned) {
+        start = aligned_load<cuda::std::uint16_t>(start_ptr);
+        end   = static_cast<cuda::std::uint32_t>(start) +
+              aligned_load<cuda::std::uint16_t>(start_ptr + sizeof(cuda::std::uint16_t));
+      } else {
+        start = misaligned_load<cuda::std::uint16_t>(start_ptr);
+        end   = static_cast<cuda::std::uint32_t>(start) +
+              misaligned_load<cuda::std::uint16_t>(start_ptr + sizeof(cuda::std::uint16_t));
+      }
+      if (start <= lower && end >= lower) { return true; }
+      if (start > lower) { break; }
+    }
+    return false;
+  }
+
+  storage_ref_type storage_ref_;
+  bool offsets_aligned_;
+  bool aligned_16_;
+};
+
+template <>
+class roaring_bitmap_impl<cuda::std::uint64_t> {
+ public:
+  using bucket_type      = roaring_bitmap_impl<cuda::std::uint32_t>;
+  using storage_ref_type = roaring_bitmap_storage_ref<cuda::std::uint64_t>;
+
+  __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref)
+    : storage_ref_{storage_ref}
+  {
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains(InputIt first,
+                         InputIt last,
+                         OutputIt contained,
+                         cuda::stream_ref stream = {}) const
+  {
+    this->contains_async(first, last, contained, stream);
+    stream.wait();
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt contained,
+                               cuda::stream_ref stream = {}) const noexcept
+  {
+    if (this->empty()) {
+      cub::DeviceTransform::Transform(
+        thrust::constant_iterator<bool>(false),
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>([] __device__(auto /* dummy */) { return false; }),
+        stream.get());
+    } else {
+      cub::DeviceTransform::Transform(
+        first,
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>(
+          [*this] __device__(auto key) { return this->contains(key); }),
+        stream.get());
+    }
+  }
+
+  __device__ bool contains(cuda::std::uint64_t value) const
+  {
+    cuda::std::uint32_t bucket_key   = value >> 32;
+    cuda::std::uint32_t bucket_value = value & 0xFFFFFFFF;
+
+    // binary search in storage_ref_.buckets()
+    cuda::std::uint32_t left  = 0;
+    cuda::std::uint32_t right = storage_ref_.metadata().num_buckets;
+    while (left < right) {
+      cuda::std::uint32_t mid = left + (right - left) / 2;
+      if (storage_ref_.buckets()[mid].first == bucket_key) {
+        return bucket_type{storage_ref_.buckets()[mid].second}.contains(
+          bucket_value);  // TODO is constructing the ref in-place a bad idea?
+      } else if (storage_ref_.buckets()[mid].first < bucket_key) {
+        left = mid + 1;
+      } else {
+        right = mid;
+      }
+    }
+    return false;
+  }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept
+  {
+    return storage_ref_.metadata().num_keys;
+  }
+
+  [[nodiscard]] __host__ __device__ bool empty() const noexcept { return this->size() == 0; }
+
+  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept
+  {
+    return storage_ref_.data();
+  }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept
+  {
+    return storage_ref_.metadata().size_bytes;
+  }
+
+  storage_ref_type storage_ref_;
+};
+
+}  // namespace cuco::experimental::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
new file mode 100644
index 000000000..01738ac7f
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/type_traits>
+#include <cuda/stream_ref>
+
+namespace cuco::experimental {
+
+template <class T>
+__host__ __device__ roaring_bitmap_ref<T>::roaring_bitmap_ref(storage_ref_type const& storage_ref)
+  : impl_{storage_ref}
+{
+}
+
+template <class T>
+template <class U /* = T */,
+          class /* = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>> */>
+__device__ roaring_bitmap_ref<T>::roaring_bitmap_ref(cuda::std::byte const* bitmap) : impl_{bitmap}
+{
+}
+
+template <class T>
+template <class InputIt, class OutputIt>
+__host__ void roaring_bitmap_ref<T>::contains(InputIt first,
+                                              InputIt last,
+                                              OutputIt output,
+                                              cuda::stream_ref stream) const
+{
+  impl_.contains(first, last, output, stream);
+}
+
+template <class T>
+template <class InputIt, class OutputIt>
+__host__ void roaring_bitmap_ref<T>::contains_async(InputIt first,
+                                                    InputIt last,
+                                                    OutputIt output,
+                                                    cuda::stream_ref stream) const noexcept
+{
+  impl_.contains_async(first, last, output, stream);
+}
+
+template <class T>
+__device__ bool roaring_bitmap_ref<T>::contains(T value) const
+{
+  return impl_.contains(value);
+}
+
+template <class T>
+__host__ __device__ cuda::std::size_t roaring_bitmap_ref<T>::size() const noexcept
+{
+  return impl_.size();
+}
+
+template <class T>
+__host__ __device__ bool roaring_bitmap_ref<T>::empty() const noexcept
+{
+  return impl_.empty();
+}
+
+template <class T>
+__host__ __device__ cuda::std::byte const* roaring_bitmap_ref<T>::data() const noexcept
+{
+  return impl_.data();
+}
+
+template <class T>
+__host__ __device__ cuda::std::size_t roaring_bitmap_ref<T>::size_bytes() const noexcept
+{
+  return impl_.size_bytes();
+}
+
+}  // namespace cuco::experimental
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
new file mode 100644
index 000000000..4c33f5ee4
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/error.hpp>
+#include <cuco/detail/roaring_bitmap/util.cuh>
+#include <cuco/detail/storage/storage_base.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/stream_ref>
+
+#include <memory>
+#include <nv/target>
+#include <utility>
+#include <vector>
+
+namespace cuco::experimental::detail {
+
+template <class T>
+struct roaring_bitmap_storage_ref {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+template <>
+class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
+ public:
+  using metadata_type = roaring_bitmap_metadata<cuda::std::uint32_t>;
+  __host__ __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap,
+                                                 metadata_type const& metadata)
+    : metadata_{metadata},
+      data_{bitmap},
+      run_container_bitmap_{bitmap + metadata.run_container_bitmap},
+      key_cards_{bitmap + metadata.key_cards},
+      container_offsets_{bitmap + metadata.container_offsets}
+  {
+    assert(metadata.valid);
+  }
+
+  __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap)
+    : roaring_bitmap_storage_ref{bitmap, metadata_type{bitmap}}
+  {
+  }
+
+  __host__ __device__ metadata_type const& metadata() const noexcept { return metadata_; }
+
+  __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; }
+
+  __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; }
+
+  __host__ __device__ cuda::std::byte const* run_container_bitmap() const noexcept
+  {
+    return run_container_bitmap_;
+  }
+
+  __host__ __device__ cuda::std::byte const* key_cards() const noexcept { return key_cards_; }
+
+  __host__ __device__ cuda::std::byte const* container_offsets() const noexcept
+  {
+    return container_offsets_;
+  }
+
+ private:
+  metadata_type metadata_;
+  cuda::std::byte const* data_;
+  cuda::std::byte const* run_container_bitmap_;
+  cuda::std::byte const* key_cards_;
+  cuda::std::byte const* container_offsets_;
+};
+
+template <>
+class roaring_bitmap_storage_ref<cuda::std::uint64_t> {
+ public:
+  using metadata_type = roaring_bitmap_metadata<cuda::std::uint64_t>;
+
+  __host__ __device__ roaring_bitmap_storage_ref(
+    cuda::std::byte const* bitmap,
+    metadata_type const& metadata,
+    cuda::std::pair<cuda::std::uint32_t, roaring_bitmap_storage_ref<cuda::std::uint32_t>>* buckets)
+    : metadata_{metadata}, data_{bitmap}, buckets_{buckets}
+  {
+  }
+
+  __host__ __device__ metadata_type const& metadata() const noexcept { return metadata_; }
+
+  __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; }
+
+  __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; }
+
+  __host__ __device__
+    cuda::std::pair<cuda::std::uint32_t, roaring_bitmap_storage_ref<cuda::std::uint32_t>>*
+    buckets() const noexcept
+  {
+    return buckets_;
+  }
+
+ private:
+  metadata_type metadata_;
+  cuda::std::byte const* data_;
+  cuda::std::pair<cuda::std::uint32_t, roaring_bitmap_storage_ref<cuda::std::uint32_t>>* buckets_;
+};
+
+template <class T, class Allocator>
+struct roaring_bitmap_storage {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+template <class Allocator>
+class roaring_bitmap_storage<cuda::std::uint32_t, Allocator> {
+ public:
+  using allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::std::byte>;
+  using ref_type = roaring_bitmap_storage_ref<cuda::std::uint32_t>;
+
+  roaring_bitmap_storage(roaring_bitmap_storage const& other)            = default;
+  roaring_bitmap_storage(roaring_bitmap_storage&& other)                 = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage const& other) = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage&& other)      = default;
+
+  ~roaring_bitmap_storage() = default;
+
+  roaring_bitmap_storage(cuda::std::byte const* bitmap,
+                         Allocator const& alloc,
+                         cuda::stream_ref stream)
+    : allocator_{alloc},
+      metadata_{bitmap},
+      data_{allocator_.allocate(metadata_.size_bytes),
+            cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
+                                                                            allocator_}},
+      ref_{data_.get(), metadata_}
+  {
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
+  }
+
+  ref_type ref() const noexcept { return ref_; }
+
+ private:
+  allocator_type allocator_;
+  typename ref_type::metadata_type metadata_;
+  std::unique_ptr<cuda::std::byte, cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>>
+    data_;
+  ref_type ref_;
+};
+
+template <class Allocator>
+class roaring_bitmap_storage<cuda::std::uint64_t, Allocator> {
+ public:
+  using allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::std::byte>;
+  using ref_type              = roaring_bitmap_storage_ref<cuda::std::uint64_t>;
+  using bucket_ref_type       = roaring_bitmap_storage_ref<cuda::std::uint32_t>;
+  using bucket_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
+    cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>>;
+
+  roaring_bitmap_storage(roaring_bitmap_storage const& other)            = default;
+  roaring_bitmap_storage(roaring_bitmap_storage&& other)                 = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage const& other) = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage&& other)      = default;
+
+  ~roaring_bitmap_storage() = default;
+
+  roaring_bitmap_storage(cuda::std::byte const* bitmap,
+                         Allocator const& alloc,
+                         cuda::stream_ref stream)
+    : allocator_{alloc},
+      bucket_allocator_{alloc},
+      bucket_metadata_{},
+      buckets_h_{},
+      metadata_{
+        [bitmap](std::vector<typename ref_type::metadata_type::bucket_metadata>& bucket_metadata) {
+          return typename ref_type::metadata_type{bitmap, bucket_metadata};
+        }(bucket_metadata_)},
+      data_{allocator_.allocate(metadata_.size_bytes),
+            cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
+                                                                            allocator_}},
+      buckets_{bucket_allocator_.allocate(metadata_.num_buckets),
+               cuco::detail::custom_deleter<cuda::std::size_t, bucket_allocator_type>{
+                 metadata_.num_buckets, bucket_allocator_}},
+      ref_{data_.get(), metadata_, buckets_.get()}
+  {
+    assert(metadata_.valid);
+    buckets_h_.reserve(bucket_metadata_.size());
+    for (auto const& meta : bucket_metadata_) {
+      buckets_h_.emplace_back(meta.key,
+                              bucket_ref_type{data_.get() + meta.byte_offset, meta.metadata});
+    }
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      buckets_.get(),
+      buckets_h_.data(),
+      metadata_.num_buckets * sizeof(cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>),
+      cudaMemcpyHostToDevice,
+      stream.get()));
+  }
+
+  ref_type ref() const noexcept { return ref_; }
+
+ private:
+  allocator_type allocator_;
+  bucket_allocator_type bucket_allocator_;
+  std::vector<typename ref_type::metadata_type::bucket_metadata> bucket_metadata_;
+  std::vector<cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>> buckets_h_;
+  typename ref_type::metadata_type metadata_;
+  std::unique_ptr<cuda::std::byte, cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>>
+    data_;
+  std::unique_ptr<cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>,
+                  cuco::detail::custom_deleter<cuda::std::size_t, bucket_allocator_type>>
+    buckets_;
+  ref_type ref_;
+};
+
+}  // namespace cuco::experimental::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh
new file mode 100644
index 000000000..1807b471d
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/util.cuh
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/iterator>
+#include <cuda/std/memory>
+
+#include <nv/target>
+#include <vector>
+
+namespace cuco::experimental::detail {
+
+template <class T>
+__host__ __device__ __forceinline__ T aligned_load(cuda::std::byte const* ptr)
+{
+  return *reinterpret_cast<T const*>(cuda::std::assume_aligned<alignof(T)>(ptr));
+}
+
+template <class T>
+__host__ __device__ __forceinline__ T misaligned_load(cuda::std::byte const* ptr)
+{
+  T value;
+  cuda::std::memcpy(&value, ptr, sizeof(T));
+  return value;
+}
+
+__host__ __device__ __forceinline__ bool check_bit(cuda::std::byte const* bitmap,
+                                                   cuda::std::uint32_t index)
+{
+  // check if the bit at index is set
+  return static_cast<cuda::std::uint8_t>(bitmap[index / 8]) &
+         (cuda::std::uint8_t(1) << (index % 8));
+}
+
+template <class T>
+struct roaring_bitmap_metadata {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+template <>
+struct roaring_bitmap_metadata<cuda::std::uint32_t> {
+  static constexpr cuda::std::uint32_t max_array_container_card = 4096;
+
+  cuda::std::size_t size_bytes             = 0;
+  cuda::std::size_t num_keys               = 0;
+  cuda::std::uint32_t run_container_bitmap = 0;
+  cuda::std::uint32_t key_cards            = 0;
+  cuda::std::uint32_t container_offsets    = 0;
+  cuda::std::int32_t num_containers        = 0;
+  bool has_run                             = false;
+  bool valid                               = false;
+
+  __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap)
+  {
+    constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
+    constexpr cuda::std::uint32_t serial_cookie                 = 12347;
+    // constexpr cuda::std::uint32_t frozen_cookie                 = 13766; // not implemented
+    constexpr cuda::std::int32_t no_offset_threshold     = 4;
+    constexpr cuda::std::int32_t max_containers          = 1 << 16;
+    constexpr cuda::std::uint32_t cookie_mask            = 0xFFFF;
+    constexpr cuda::std::uint32_t cookie_shift           = 16;
+    constexpr cuda::std::uint32_t bitset_container_bytes = 8192;
+
+    cuda::std::byte const* buf = bitmap;
+
+    cuda::std::uint32_t cookie;
+    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
+    buf += sizeof(cuda::std::uint32_t);
+    if ((cookie & cookie_mask) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
+      valid = false;
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        CUCO_FAIL(
+          "Invalid bitmap format: cookie type invalid or not supported");)  // TODO device error
+                                                                            // handling
+      return;
+    }
+
+    if ((cookie & cookie_mask) == serial_cookie)
+      // upper 16 bits of cookie are the number of containers - 1
+      num_containers = (cookie >> cookie_shift) + 1;
+    else {
+      // following 4 bytes are the number of containers
+      cuda::std::memcpy(&num_containers, buf, sizeof(cuda::std::uint32_t));
+      buf += sizeof(cuda::std::uint32_t);
+    }
+    if (num_containers < 0 or num_containers > max_containers) {
+      valid = false;
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        CUCO_FAIL(
+          "Invalid bitmap format: num_containers out of range");)  // TODO device error handling
+      return;
+    }
+
+    has_run = (cookie & cookie_mask) == serial_cookie;
+    if (has_run) {
+      cuda::std::size_t s  = (num_containers + 7) / 8;  // ceil bytes to store run container bitmap
+      run_container_bitmap = cuda::std::distance(bitmap, buf);
+      buf += s;
+    }
+
+    key_cards = cuda::std::distance(bitmap, buf);
+    // if the current address is aligned to 2 bytes, then all containers are aligned to at least 2
+    // bytes
+    bool const aligned_16 = (reinterpret_cast<cuda::std::uintptr_t>(bitmap + key_cards) %
+                             sizeof(cuda::std::uint16_t)) == 0;
+    buf += num_containers * 2 * sizeof(cuda::std::uint16_t);
+
+    if ((!has_run) || (num_containers >= no_offset_threshold)) {
+      container_offsets = cuda::std::distance(bitmap, buf);
+      buf += num_containers * sizeof(cuda::std::uint32_t);
+    } else {
+      valid = false;
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        CUCO_FAIL("Invalid bitmap format: not implemented");)  // TODO device error handling
+      return;
+    }
+
+    cuda::std::uint32_t card = 0;
+    for (cuda::std::int32_t i = 0; i < num_containers; i++) {
+      cuda::std::byte const* card_ptr =
+        bitmap + key_cards + (i * 2 + 1) * sizeof(cuda::std::uint16_t);
+      if (aligned_16) {
+        card = 1u + aligned_load<cuda::std::uint16_t>(card_ptr);
+      } else {
+        card = 1u + misaligned_load<cuda::std::uint16_t>(card_ptr);
+      }
+      num_keys += card;
+    }
+
+    // find end of roaring bitmap (re-use card from last container)
+    cuda::std::byte const* end =
+      bitmap + misaligned_load<cuda::std::uint32_t>(
+                 bitmap + container_offsets + (num_containers - 1) * sizeof(cuda::std::uint32_t));
+    if (has_run and check_bit(bitmap + run_container_bitmap, num_containers - 1)) {
+      cuda::std::uint16_t const num_runs = misaligned_load<cuda::std::uint16_t>(end);
+      end += sizeof(cuda::std::uint16_t) + num_runs * 2 * sizeof(cuda::std::uint16_t);
+    } else {
+      if (card <= max_array_container_card) {
+        end += card * sizeof(cuda::std::uint16_t);
+      } else {
+        end += bitset_container_bytes;  // fixed size bitset container
+      }
+    }
+
+    size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, end));
+    valid      = true;
+  }
+};
+
+template <>
+struct roaring_bitmap_metadata<cuda::std::uint64_t> {
+  cuda::std::size_t num_buckets = 0;
+  cuda::std::size_t size_bytes  = 0;
+  cuda::std::size_t num_keys    = 0;
+  bool valid                    = false;
+
+  struct bucket_metadata {
+    cuda::std::size_t byte_offset;
+    cuda::std::uint32_t key;
+    roaring_bitmap_metadata<cuda::std::uint32_t> metadata;
+
+    bucket_metadata(cuda::std::size_t offset,
+                    cuda::std::uint32_t k,
+                    roaring_bitmap_metadata<cuda::std::uint32_t> const& meta)
+      : byte_offset{offset}, key{k}, metadata{meta}
+    {
+    }
+  };
+
+  __host__ roaring_bitmap_metadata(cuda::std::byte const* bitmap,
+                                   std::vector<bucket_metadata>& bucket_metadata)
+  {
+    cuda::std::size_t byte_offset     = 0;
+    cuda::std::byte const* bitmap_ptr = bitmap;
+    cuda::std::memcpy(&num_buckets, bitmap_ptr, sizeof(cuda::std::uint64_t));
+    byte_offset += sizeof(cuda::std::uint64_t);  // skip num_buckets
+
+    bucket_metadata.clear();
+    bucket_metadata.reserve(num_buckets);
+
+    for (cuda::std::size_t i = 0; i < num_buckets; ++i) {
+      cuda::std::uint32_t bucket_key;
+      cuda::std::memcpy(&bucket_key, bitmap_ptr + byte_offset, sizeof(cuda::std::uint32_t));
+      byte_offset += sizeof(cuda::std::uint32_t);  // skip bucket key
+      roaring_bitmap_metadata<cuda::std::uint32_t> bucket_meta{bitmap_ptr + byte_offset};
+      if (!bucket_meta.valid) {
+        valid = false;
+        return;
+      }
+      bucket_metadata.emplace_back(byte_offset, bucket_key, bucket_meta);
+      num_keys += bucket_meta.num_keys;
+      byte_offset += bucket_meta.size_bytes;  // skip bucket
+    }
+    size_bytes = byte_offset;
+    valid      = true;
+  }
+
+  __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap)
+  {
+    cuda::std::size_t byte_offset     = 0;
+    cuda::std::byte const* bitmap_ptr = bitmap;
+    cuda::std::memcpy(&num_buckets, bitmap_ptr, sizeof(cuda::std::uint64_t));
+    byte_offset += sizeof(cuda::std::uint64_t);  // skip num_buckets
+
+    for (cuda::std::size_t i = 0; i < num_buckets; ++i) {
+      byte_offset += sizeof(cuda::std::uint32_t);  // skip bucket key
+      roaring_bitmap_metadata<cuda::std::uint32_t> bucket_meta{bitmap_ptr + byte_offset};
+      if (!bucket_meta.valid) {
+        valid = false;
+        return;
+      }
+      num_keys += bucket_meta.num_keys;
+      byte_offset += bucket_meta.size_bytes;  // skip bucket
+    }
+    size_bytes = byte_offset;
+    valid      = true;
+  }
+};
+}  // namespace cuco::experimental::detail
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
new file mode 100644
index 000000000..e7c083bf3
--- /dev/null
+++ b/include/cuco/roaring_bitmap.cuh
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh>
+#include <cuco/roaring_bitmap_ref.cuh>
+#include <cuco/utility/allocator.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/stream_ref>
+
+namespace cuco::experimental {
+
+/**
+ * @brief GPU-accelerated container that owns a serialized Roaring bitmap.
+ *
+ * The `roaring_bitmap` provides host-side bulk membership queries over a bitmap stored in the
+ * [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
+ * The serialized bytes are copied to device-accessible storage upon construction, and queries are
+ * executed on the GPU.
+ *
+ * In addition to bulk host APIs such as `contains`/`contains_async`, this container exposes a
+ * non-owning reference object via `ref()` that can be used for device-side per-thread queries.
+ *
+ * @tparam T Key type. Must be `cuda::std::uint32_t` or `cuda::std::uint64_t`.
+ * @tparam Allocator Allocator type used to manage device-accessible storage for the serialized
+ *                   bytes.
+ */
+template <class T, class Allocator = cuco::cuda_allocator<cuda::std::byte>>
+class roaring_bitmap {
+ public:
+  using value_type     = T;                                             ///< Key type
+  using storage_type   = detail::roaring_bitmap_storage<T, Allocator>;  ///< Storage implementation
+  using allocator_type = typename storage_type::allocator_type;         ///< Allocator type
+  using ref_type       = roaring_bitmap_ref<value_type>;  ///< Non-owning reference type
+
+  /**
+   * @brief Constructs a `roaring_bitmap` by copying the serialized bytes to device-accessible
+   *        storage.
+   *
+   * @param bitmap Pointer to the beginning of the serialized bitmap in host memory
+   * @param alloc Allocator used to allocate device-accessible storage
+   * @param stream CUDA stream used for device memory operations during construction
+   */
+  roaring_bitmap(cuda::std::byte const* bitmap,
+                 Allocator const& alloc  = {},
+                 cuda::stream_ref stream = {});
+
+  roaring_bitmap(roaring_bitmap const& other)            = default;  ///< Copy constructor
+  roaring_bitmap(roaring_bitmap&& other)                 = default;  ///< Move constructor
+  roaring_bitmap& operator=(roaring_bitmap const& other) = default;  ///< Copy assignment
+  roaring_bitmap& operator=(roaring_bitmap&& other)      = default;  ///< Move assignment
+
+  ~roaring_bitmap() = default;  ///< Destructor
+
+  /**
+   * @brief Bulk membership query for keys in `[first, last)`.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   *       `contains_async`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator whose `value_type` is
+   * constructible from `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <class InputIt, class OutputIt>
+  void contains(InputIt first,
+                InputIt last,
+                OutputIt contained,
+                cuda::stream_ref stream = {}) const;
+
+  /**
+   * @brief Asynchronously performs a bulk membership query for keys in `[first, last)`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <class InputIt, class OutputIt>
+  void contains_async(InputIt first,
+                      InputIt last,
+                      OutputIt contained,
+                      cuda::stream_ref stream = {}) const noexcept;
+
+  /**
+   * @brief Number of keys stored in the bitmap.
+   *
+   * @return Count of keys in the bitmap
+   */
+  [[nodiscard]] cuda::std::size_t size() const noexcept;
+
+  /**
+   * @brief Checks whether the bitmap contains no keys.
+   *
+   * @return `true` iff `size() == 0`
+   */
+  [[nodiscard]] bool empty() const noexcept;
+
+  /**
+   * @brief Returns a pointer to the beginning of the serialized bitmap bytes in device-accessible
+   *        storage.
+   *
+   * @return Pointer to the serialized storage
+   */
+  [[nodiscard]] cuda::std::byte const* data() const noexcept;
+
+  /**
+   * @brief Size in bytes of the serialized bitmap storage.
+   *
+   * @return Number of bytes occupied by the serialized bitmap
+   */
+  [[nodiscard]] cuda::std::size_t size_bytes() const noexcept;
+
+  /**
+   * @brief Returns the allocator used to manage device-accessible storage.
+   *
+   * @return Allocator instance
+   */
+  [[nodiscard]] allocator_type allocator() const noexcept;
+
+  /**
+   * @brief Returns a non-owning reference to the underlying bitmap suitable for device-side use.
+   *
+   * The returned reference type provides device functions such as `contains(T)` for per-thread
+   * membership testing.
+   *
+   * @return Non-owning reference to the underlying bitmap
+   */
+  [[nodiscard]] ref_type ref() const noexcept;
+
+ private:
+  storage_type storage_;  ///< Storage type
+};
+
+}  // namespace cuco::experimental
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap.inl>
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
new file mode 100644
index 000000000..071640a3b
--- /dev/null
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/stream_ref>
+
+namespace cuco::experimental {
+
+/**
+ * @brief Non-owning reference to a Roaring bitmap stored in its serialized format.
+ *
+ * A `roaring_bitmap_ref` provides device and host APIs to query membership against a bitmap that
+ * is laid out according to the [Roaring bitmap format
+ * specification](https://github.com/RoaringBitmap/RoaringFormatSpec). The object does not own the
+ * underlying storage; it simply provides algorithms over the referenced bytes.
+ *
+ * @note The reference reads directly from the serialized representation without deserializing.
+ *       It supports 32-bit and 64-bit key types. For 32-bit bitmaps the layout follows the
+ *       "Standard 32-bit Roaring Bitmap" format; for 64-bit bitmaps, the "portable" format is
+ * supported.
+ *
+ * @tparam T Key type stored in the bitmap. Must be `cuda::std::uint32_t` or `cuda::std::uint64_t`.
+ */
+template <class T>
+class roaring_bitmap_ref {
+  using impl_type = detail::roaring_bitmap_impl<T>;
+
+ public:
+  using value_type       = T;                                     ///< Key type stored in the bitmap
+  using storage_ref_type = typename impl_type::storage_ref_type;  ///< Implementation storage ref
+
+  /**
+   * @brief Constructs a non-owning reference from an implementation-specific storage reference.
+   *
+   * @param storage_ref Reference to the underlying serialized bitmap storage
+   */
+  __host__ __device__ roaring_bitmap_ref(storage_ref_type const& storage_ref);
+
+  /**
+   * @brief Constructs a device-side reference from a raw pointer to a 32-bit Roaring bitmap.
+   *
+   * @note This constructor is only available when `T == cuda::std::uint32_t` and can be used in
+   *       device code to create a lightweight view over device-resident serialized bytes.
+   *
+   * @param bitmap Pointer to the beginning of the serialized bitmap in device memory
+   */
+  template <class U = T,
+            class   = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>>>
+  __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap);
+
+  /**
+   * @brief Bulk membership query for keys in `[first, last)`.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   *       `contains_async`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <class InputIt, class OutputIt>
+  __host__ void contains(InputIt first,
+                         InputIt last,
+                         OutputIt contained,
+                         cuda::stream_ref stream = {}) const;
+
+  /**
+   * @brief Asynchronously performs a bulk membership query for keys in `[first, last)`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <class InputIt, class OutputIt>
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt contained,
+                               cuda::stream_ref stream = {}) const noexcept;
+
+  /**
+   * @brief Device-side membership query for a single key.
+   *
+   * @param value Key to test for membership
+   *
+   * @return `true` iff `value` is contained in the bitmap
+   */
+  __device__ bool contains(T value) const;
+
+  /**
+   * @brief Number of keys stored in the bitmap.
+   *
+   * @return Count of keys in the bitmap
+   */
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept;
+
+  /**
+   * @brief Checks whether the bitmap contains no keys.
+   *
+   * @return `true` iff `size() == 0`
+   */
+  [[nodiscard]] __host__ __device__ bool empty() const noexcept;
+
+  /**
+   * @brief Returns a pointer to the beginning of the serialized bitmap bytes.
+   *
+   * @return Pointer to the serialized storage
+   */
+  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept;
+
+  /**
+   * @brief Size in bytes of the serialized bitmap storage.
+   *
+   * @return Number of bytes occupied by the serialized bitmap
+   */
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept;
+
+ private:
+  impl_type impl_;
+};
+
+}  // namespace cuco::experimental
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl>
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 21828b360..23258d445 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -152,3 +152,8 @@ ConfigureTest(BLOOM_FILTER_TEST
     bloom_filter/unique_sequence_test.cu
     bloom_filter/arrow_policy_test.cu
     bloom_filter/variable_cg_test.cu)
+
+###################################################################################################
+# - roaring_bitmap ---------------------------------------------------------------------------------
+ConfigureTest(ROARING_BITMAP_TEST
+    roaring_bitmap/contains_test.cu)
diff --git a/tests/roaring_bitmap/contains_test.cu b/tests/roaring_bitmap/contains_test.cu
new file mode 100644
index 000000000..42e0db3c8
--- /dev/null
+++ b/tests/roaring_bitmap/contains_test.cu
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/roaring_bitmap.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+#include <thrust/device_vector.h>
+#include <thrust/logical.h>
+#include <thrust/universal_vector.h>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <vector>
+
+namespace {
+template <typename KeyType>
+bool check(std::string const& bitmap_file_path)
+{
+  auto generate_keys = []() -> thrust::device_vector<KeyType> {
+    if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint32_t>) {
+      std::vector<cuda::std::uint32_t> keys;
+      for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
+        keys.push_back(k);
+      }
+      for (int k = 100000; k < 200000; ++k) {
+        keys.push_back(3 * k);
+      }
+      for (int k = 700000; k < 800000; ++k) {
+        keys.push_back(k);
+      }
+      return thrust::device_vector<cuda::std::uint32_t>(keys.begin(), keys.end());
+    } else if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint64_t>) {
+      std::vector<cuda::std::uint64_t> keys;
+      for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) {
+        keys.push_back(k);
+      }
+      for (cuda::std::uint64_t k = 0x0A000ull; k < 0x10000ull; ++k) {
+        keys.push_back(k);
+      }
+      keys.push_back(0x20000ull);
+      keys.push_back(0x20005ull);
+      for (cuda::std::uint64_t i = 0; i < 0x10000ull; i += 2ull) {
+        keys.push_back(0x80000ull + i);
+      }
+      return thrust::device_vector<cuda::std::uint64_t>(keys.begin(), keys.end());
+    } else {
+      static_assert(cuco::dependent_false<KeyType>, "KeyType must be uint32_t or uint64_t");
+      return {};
+    }
+  };
+
+  std::ifstream file(bitmap_file_path, std::ios::binary);
+  if (!file.is_open()) { return false; }
+
+  auto file_size = std::filesystem::file_size(bitmap_file_path);
+
+  thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
+
+  file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
+  file.close();
+
+  cuco::experimental::roaring_bitmap<KeyType> roaring_bitmap(
+    thrust::raw_pointer_cast(buffer.data()));
+
+  auto keys = generate_keys();
+  thrust::device_vector<bool> contained(keys.size(), false);
+
+  roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin());
+
+  bool const all_contained =
+    thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
+  return all_contained;
+}
+}  // namespace
+
+TEST_CASE("roaring_bitmap bulk contains from RoaringFormatSpec testdata", "[roaring_bitmap]")
+{
+#ifndef CUCO_ROARING_DATA_DIR
+  SKIP(
+    "CUCO_ROARING_DATA_DIR is not defined. Configure with -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to "
+    "run this test.");
+#else
+  std::string const data_dir = CUCO_ROARING_DATA_DIR;
+
+  SECTION("32-bit: bitmapwithoutruns.bin")
+  {
+    std::string const path = data_dir + "/bitmapwithoutruns.bin";
+    if (!std::ifstream(path).good()) {
+      std::string const msg = std::string("Missing file: ") + path;
+      SKIP(msg.c_str());
+    }
+    REQUIRE(check<cuda::std::uint32_t>(path));
+  }
+
+  SECTION("32-bit: bitmapwithruns.bin")
+  {
+    std::string const path = data_dir + "/bitmapwithruns.bin";
+    if (!std::ifstream(path).good()) {
+      std::string const msg = std::string("Missing file: ") + path;
+      SKIP(msg.c_str());
+    }
+    REQUIRE(check<cuda::std::uint32_t>(path));
+  }
+
+  SECTION("64-bit: portable_bitmap64.bin")
+  {
+    std::string const path = data_dir + "/portable_bitmap64.bin";
+    if (!std::ifstream(path).good()) {
+      std::string const msg = std::string("Missing file: ") + path;
+      SKIP(msg.c_str());
+    }
+    REQUIRE(check<cuda::std::uint64_t>(path));
+  }
+#endif
+}
\ No newline at end of file