diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml index a5f9372aec..54f0a42f5d 100644 --- a/.github/actions/build_cmake/action.yml +++ b/.github/actions/build_cmake/action.yml @@ -39,11 +39,11 @@ runs: conda update -y -q conda echo "$CONDA/bin" >> $GITHUB_PATH - conda install -y -q python=3.11 cmake=3.26 make=4.2 swig=4.0 "numpy<2" scipy=1.14 pytest=7.4 gflags=2.2 + conda install -y -q python=3.11 cmake=3.30.4 make=4.2 swig=4.0 "numpy<2" scipy=1.14 pytest=7.4 gflags=2.2 # install base packages for ARM64 if [ "${{ runner.arch }}" = "ARM64" ]; then - conda install -y -q -c conda-forge openblas=0.3 gxx_linux-aarch64=14.2 sysroot_linux-aarch64=2.17 + conda install -y -q -c conda-forge openblas=0.3.29 gxx_linux-aarch64=14.2 sysroot_linux-aarch64=2.17 fi # install base packages for X86_64 @@ -61,7 +61,7 @@ runs: conda install -y -q cuda-toolkit=12.4 -c "nvidia/label/cuda-12.4.0" # and CUDA from cuVS channel for cuVS builds elif [ "${{ inputs.cuvs }}" = "ON" ]; then - conda install -y -q libcuvs=24.12 'cuda-version>=12.0,<=12.5' cuda-toolkit=12.4.1 gxx_linux-64=12.4 -c rapidsai -c conda-forge + conda install -y -q libcuvs=25.04 'cuda-version>=12.0,<=12.5' cuda-toolkit=12.4.1 gxx_linux-64=12.4 -c rapidsai -c rapidsai-nightly -c conda-forge fi # install test packages diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml index d2e56d23c3..c27446d840 100644 --- a/.github/actions/build_conda/action.yml +++ b/.github/actions/build_conda/action.yml @@ -44,7 +44,7 @@ runs: # Ensure starting packages are from conda-forge. 
conda list --show-channel-urls conda install -y -q "conda!=24.11.0" - conda install -y -q "conda-build!=24.11.0" + conda install -y -q "conda-build=25.3.1" "liblief=0.14.1" conda list --show-channel-urls - name: Enable anaconda uploads if: inputs.label != '' diff --git a/.github/workflows/retry_build.yml b/.github/workflows/retry_build.yml index ff4e944adf..45c07ffff3 100644 --- a/.github/workflows/retry_build.yml +++ b/.github/workflows/retry_build.yml @@ -15,7 +15,10 @@ jobs: GH_TOKEN: ${{ github.token }} GH_DEBUG: api run: | - while gh run view ${{ inputs.run_id }} --json status | grep -q in_progress + # status can be one of "queued", "in_progress", "completed", "waiting", "requested", "pending" + # https://docs.github.com/en/rest/checks/runs + # while not completed, sleep for 10 minutes + while gh run view ${{ inputs.run_id }} --json status | grep -v completed do echo Workflow in progress - sleeping for 10 minutes then checking again sleep 10m diff --git a/CHANGELOG.md b/CHANGELOG.md index c1771f2927..f6826aee8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,87 @@ All notable changes to this project will be documented in this file. 
## [Unreleased] +## [1.11.0] - 2025-04-24 + + +Added +- RaBitQ implementation (#4235) +- Add RaBitQ to the swigfaiss so we can access its properties correctly in python (#4304) +- Add date and time to the codec file path so that the file doesn't get overridden with each run (#4303) +- Add missing header in faiss/CMakeLists.txt (#4285) +- Implement is_spherical and normalize_L2 booleans as part of the training APIs (#4279) +- Add normalize_l2 boolean to distributed training API +- re-land mmap diff (#4250) +- SearchParameters support for IndexBinaryFlat (#4055) +- Support non-partition col and map in the embedding reader (#4229) +- Support cosine distance for training vectors (#4227) +- Add missing #include in code_distance-sve.h (#4219) +- Add the support for IndexIDMap with Cagra index (#4188) +- Add bounds checking to hnsw nb_neighbors (#4185) +- Add sharding convenience function for IVF indexes (#4150) +- Added support for building for MinGW, in addition to MSVC (#4145) + +Changed +- Skip mmap test case in AIX. 
(#4275) +- Handle insufficient driver gracefully (#4271) +- relax input params for IndexIVFRaBitQ::get_InvertedListScanner() (#4270) +- Allow using custom index readers and writers (#4180) +- Upgrade to libcuvs=25.04 (#4164) +- ignore regex (#4264) +- Publish the C API to Conda (#4186) +- Pass row filters to Hive Reader to filter rows (#4256) +- Back out "test merge with internal repo" (#4244) +- test merge with internal repo (#4242) +- Revert D69972250: Memory-mapping and Zero-copy deserializers +- Revert D69984379: mem mapping and zero-copy python fixes +- mem mapping and zero-copy python fixes (#4212) +- Memory-mapping and Zero-copy deserializers (#4199) +- Use `nullptr` in faiss/gpu/StandardGpuResources.cpp (#4232) +- Make static method in header inline (#4214) +- Upgrade openblas to 0.3.29 for ARM architectures (#4203) +- Pass `store_dataset` argument along to cuVS CAGRA (#4173) +- Handle plain SearchParameters in HNSW searches (#4167) +- Update INSTALL.md to remove some raft references, add missing dependency (#4176) +- Update README.md (#4169) +- Update CAGRA docs (#4152) +- Expose IDSelectorBitmap in the C_API (#4158) + +Fixed +- fix: algorithm of spreading vectors over shards (#4299) +- Fix overflow of int32 in IndexNSG (#4297) +- Fix Type Error in Conditional Logic (#4294) +- faiss/gpu/GpuAutoTune.cpp: fix llvm-19-exposed -Wunused-but-set-variable warnings +- Fix nightly by pinning conda-build to prevent regression in 25.3.2 (#4287) +- Fix CQS signal. 
Id] 88153895 -- readability-redundant-string-init in fbcode/faiss (#4283) +- Fix a placeholder for 'unimplemented' in mapped_io.cpp (#4268) +- fix bug: IVFPQ of raft/cuvs does not require redundant check (#4241) +- fix a serialization problem in RaBitQ (#4261) +- Grammar fix in FlatIndexHNSW (#4253) +- Fix CUDA kernel index data type in faiss/gpu/impl/DistanceUtils.cuh +10 (#4246) +- fix `IVFPQFastScan::RangeSearch()` on the `ARM` architecture (#4247) +- fix integer overflow issue when calculating imbalance_factor (#4245) +- Fix bug with metric_arg in IndexHNSW (#4239) +- Address compile errors and warnings (#4238) +- faiss: fix non-templated hammings function (#4195) +- Fix LLVM-19 compilation issue in faiss/AutoTune.cpp (#4220) +- Fix cloning and reverse index factory for NSG indices (#4151) +- Remove python_abi to fix nightly (#4217) +- Fix IVF quantizer centroid sharding so IDs are generated (#4197) +- Pin lief to fix nightly (#4211) +- Fix Sapphire Rapids never loading in Python bindings (#4209) +- Attempt to nightly fix (#4204) +- Fix nightly by installing earlier version of lief (#4198) +- Check for not completed +- Fix install error when building avx512_spr variant (#4170) +- fix: gpu tests link failure with static lib (#4137) +- Fix the order of parameters in bench_scalar_quantizer_distance. (#4159) + +Deprecated +- Remove unused exception parameter from faiss/impl/ResultHandler.h (#4243) +- Remove unused variable (#4205) + + + ## [1.10.0] - 2025-01-30 @@ -459,7 +540,9 @@ by conda install -c pytorch faiss-gpu cudatoolkit=10.0. - C bindings. - Extended tutorial to GPU indices. 
-[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.9.0...HEAD +[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.11.0...HEAD +[1.11.0]: https://github.com/facebookresearch/faiss/compare/v1.10.0...v1.11.0 +[1.10.0]: https://github.com/facebookresearch/faiss/compare/v1.9.0...v1.10.0 [1.9.0]: https://github.com/facebookresearch/faiss/compare/v1.8.0...v1.9.0 [1.8.0]: https://github.com/facebookresearch/faiss/compare/v1.7.4...v1.8.0 [1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4 diff --git a/CMakeLists.txt b/CMakeLists.txt index 328c4a5e27..565a0306b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,20 +34,20 @@ if(FAISS_ENABLE_GPU) endif() if(FAISS_ENABLE_CUVS) -include(cmake/thirdparty/fetch_rapids.cmake) -include(rapids-cmake) -include(rapids-cpm) -include(rapids-cuda) -include(rapids-export) -include(rapids-find) - -rapids_cuda_init_architectures(faiss) -rapids_cuda_init_architectures(pyfaiss) -rapids_cuda_init_architectures(faiss_c_library) + include(cmake/thirdparty/fetch_rapids.cmake) + include(rapids-cmake) + include(rapids-cpm) + include(rapids-cuda) + include(rapids-export) + include(rapids-find) + + rapids_cuda_init_architectures(faiss) + rapids_cuda_init_architectures(pyfaiss) + rapids_cuda_init_architectures(faiss_c_library) endif() project(faiss - VERSION 1.10.0 + VERSION 1.11.0 DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." HOMEPAGE_URL "https://github.com/facebookresearch/faiss" LANGUAGES ${FAISS_LANGUAGES}) diff --git a/INSTALL.md b/INSTALL.md index 6e75826a56..8acbf4563f 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -6,26 +6,26 @@ pre-release nightly builds. 
- The CPU-only faiss-cpu conda package is currently available on Linux (x86-64 and aarch64), OSX (arm64 only), and Windows (x86-64) - faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86-64 only) for CUDA 11.4 and 12.1 -- faiss-gpu-raft [^1] package containing GPU indices provided by [NVIDIA RAFT](https://github.com/rapidsai/raft/) version 24.06, is available on Linux (x86-64 only) for CUDA 11.8 and 12.4. +- faiss-gpu-cuvs package containing GPU indices provided by [NVIDIA cuVS](https://github.com/rapidsai/cuvs/) version 24.12, is available on Linux (x86-64 only) for CUDA 11.8 and 12.4. To install the latest stable release: ``` shell # CPU-only version -$ conda install -c pytorch faiss-cpu=1.10.0 +$ conda install -c pytorch faiss-cpu=1.11.0 # GPU(+CPU) version -$ conda install -c pytorch -c nvidia faiss-gpu=1.10.0 +$ conda install -c pytorch -c nvidia faiss-gpu=1.11.0 -# GPU(+CPU) version with NVIDIA RAFT -$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.10.0 +# GPU(+CPU) version with NVIDIA cuVS +$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge libnvjitlink faiss-gpu-cuvs=1.11.0 # GPU(+CPU) version using AMD ROCm not yet available ``` For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel. -For faiss-gpu-raft, the rapidsai, conda-forge and nvidia channels are required. +For faiss-gpu-cuvs, the rapidsai, conda-forge and nvidia channels are required. 
Nightly pre-release packages can be installed as follows: @@ -34,13 +34,13 @@ Nightly pre-release packages can be installed as follows: $ conda install -c pytorch/label/nightly faiss-cpu # GPU(+CPU) version -$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.10.0 +$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.11.0 # GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 12.4) -conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=12.0,<=12.5' +conda install -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=12.0,<=12.5' # GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 11.8) -conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=11.4,<=11.8' +conda install -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=11.4,<=11.8' # GPU(+CPU) version using AMD ROCm not yet available ``` @@ -321,5 +321,3 @@ and you can run $ python demos/demo_auto_tune.py ``` to test the GPU code. - -[^1]: The vector search and clustering algorithms in NVIDIA RAFT have been formally migrated to [NVIDIA cuVS](https://github.com/rapidsai/cuvs). This package is being renamed to `faiss-gpu-cuvs` in the next stable release, which will use these GPU implementations from the pre-compiled `libcuvs=24.12` binary. diff --git a/README.md b/README.md index 468ba59ab6..1a6949ab45 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ For the GPU version of Faiss, please cite: ## Join the Faiss community -For public discussion of Faiss or for questions, there is a Facebook group at https://www.facebook.com/groups/faissusers/ +For public discussion of Faiss or for questions, visit https://github.com/facebookresearch/faiss/discussions. 
We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository. You can report bugs, ask questions, etc. diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index 8b1d65a505..cf0ee958fe 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -83,8 +83,15 @@ class DatasetDescriptor: embedding_column: Optional[str] = None + # only when the embedding column is a map + embedding_column_key: Optional[Any] = None + embedding_id_column: Optional[str] = None + # filters on the dataset where each filter is a + # string rep of a filter expression + filters: Optional[List[str]] = None + # unused in open-source splits_distribution: Optional[List[List[bytes]]] = None @@ -106,6 +113,10 @@ class DatasetDescriptor: # desc_name desc_name: Optional[str] = None + filename_suffix: Optional[str] = None + + normalize_L2: bool = False + def __hash__(self): return hash(self.get_filename()) @@ -129,6 +140,8 @@ def get_filename( ).replace("=", "_").replace("/", "_") if self.num_vectors is not None: filename += f"_{self.num_vectors}" + if self.filename_suffix is not None: + filename += f"_{self.filename_suffix}" filename += "." 
self.desc_name = filename @@ -214,6 +227,8 @@ class CodecDescriptor(IndexBaseDescriptor): factory: Optional[str] = None construction_params: Optional[List[Dict[str, int]]] = None training_vectors: Optional[DatasetDescriptor] = None + normalize_l2: bool = False + is_spherical: bool = False FILENAME_PREFIX: str = "xt" def __post_init__(self): diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py index fe2fe103ef..b1252ad1b0 100644 --- a/benchs/bench_fw/index.py +++ b/benchs/bench_fw/index.py @@ -1138,6 +1138,8 @@ def assemble(self, dry_run): return None, None, "" logger.info(f"assemble, train {self.factory}") xt = self.io.get_dataset(self.training_vectors) + if self.training_vectors.normalize_L2: + faiss.normalize_L2(xt) _, t, _ = timer("train", lambda: codec.train(xt), once=True) t_aggregate += t diff --git a/c_api/CMakeLists.txt b/c_api/CMakeLists.txt index cffb8c307c..b2d33c54f1 100644 --- a/c_api/CMakeLists.txt +++ b/c_api/CMakeLists.txt @@ -32,9 +32,11 @@ set(FAISS_C_SRC index_io_c.cpp index_io_c_ex.cpp impl/AuxIndexStructures_c.cpp + impl/io_c.cpp utils/distances_c.cpp utils/utils_c.cpp ) + add_library(faiss_c ${FAISS_C_SRC}) if(FAISS_OPT_LEVEL STREQUAL "generic") target_link_libraries(faiss_c PRIVATE faiss) @@ -43,13 +45,73 @@ elseif(FAISS_OPT_LEVEL STREQUAL "avx2") elseif(FAISS_OPT_LEVEL STREQUAL "avx512") target_link_libraries(faiss_c PRIVATE faiss_avx512) endif() -install(TARGETS faiss_c - EXPORT faiss-targets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} -) + +add_library(faiss_c_avx2 ${FAISS_C_SRC}) +target_link_libraries(faiss_c_avx2 PRIVATE faiss_avx2) +if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") + set_target_properties(faiss_c_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() +if(NOT WIN32) + 
target_compile_options(faiss_c_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt>) +else() + # MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT + # Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64 + # Ref. F16C (2nd paragraph): https://walbourn.github.io/directxmath-avx2/ + # Ref. POPCNT: https://docs.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64 + target_compile_options(faiss_c_avx2 PRIVATE $<$:/arch:AVX2>) +endif() + +add_library(faiss_c_avx512 ${FAISS_C_SRC}) +target_link_libraries(faiss_c_avx512 PRIVATE faiss_avx512) +if(NOT FAISS_OPT_LEVEL STREQUAL "avx512") + set_target_properties(faiss_c_avx512 PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() +if(NOT WIN32) + # All modern CPUs support F, CD, VL, DQ, BW extensions. + # Ref: https://en.wikipedia.org/wiki/AVX512 + target_compile_options(faiss_c_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>) +else() + target_compile_options(faiss_c_avx512 PRIVATE $<$:/arch:AVX512>) +endif() + +add_library(faiss_c_avx512_spr ${FAISS_C_SRC}) +target_link_libraries(faiss_c_avx512_spr PRIVATE faiss_avx512_spr) +if(NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") + set_target_properties(faiss_c_avx512_spr PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() +if(NOT WIN32) + # Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids. 
+ # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide + target_compile_options(faiss_c_avx512_spr PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) +else() + target_compile_options(faiss_c_avx512_spr PRIVATE $<$:/arch:AVX512>) +endif() + +add_library(faiss_c_sve ${FAISS_C_SRC}) +target_link_libraries(faiss_c_sve PRIVATE faiss_sve) +if(NOT FAISS_OPT_LEVEL STREQUAL "sve") + set_target_properties(faiss_c_sve PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() +if(NOT WIN32) + if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=native") + # Do nothing, expect SVE to be enabled by -march=native + elseif("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )(-march=armv[0-9]+(\\.[1-9]+)?-[^+ ](\\+[^+$ ]+)*)") + # Add +sve + target_compile_options(faiss_c_sve PRIVATE $<$,$>:${CMAKE_MATCH_2}+sve>) + elseif(NOT "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=armv") + # No valid -march, so specify -march=armv8-a+sve as the default + target_compile_options(faiss_c_sve PRIVATE $<$,$>:-march=armv8-a+sve>) + endif() + if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )-march=native") + # Do nothing, expect SVE to be enabled by -march=native + elseif("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )(-march=armv[0-9]+(\\.[1-9]+)?-[^+ ](\\+[^+$ ]+)*)") + # Add +sve + target_compile_options(faiss_c_sve PRIVATE $<$,$>:${CMAKE_MATCH_2}+sve>) + elseif(NOT "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )-march=armv") + # No valid -march, so specify -march=armv8-a+sve as the default + target_compile_options(faiss_c_sve PRIVATE $<$,$>:-march=armv8-a+sve>) + endif() +endif() function(faiss_install_headers headers p) foreach(h ${headers}) @@ -68,6 +130,42 @@ file(GLOB FAISS_C_API_HEADERS faiss_install_headers("${FAISS_C_API_HEADERS}" c_api) +install(TARGETS faiss_c + EXPORT faiss-targets + RUNTIME 
DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) +if(FAISS_OPT_LEVEL STREQUAL "avx2") + install(TARGETS faiss_c_avx2 + EXPORT faiss-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() +if(FAISS_OPT_LEVEL STREQUAL "avx512") + install(TARGETS faiss_c_avx2 faiss_c_avx512 + EXPORT faiss-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() +if(FAISS_OPT_LEVEL STREQUAL "avx512_spr") + install(TARGETS faiss_c_avx2 faiss_c_avx512_spr + EXPORT faiss-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() +if(FAISS_OPT_LEVEL STREQUAL "sve") + install(TARGETS faiss_c_sve + EXPORT faiss-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() + add_executable(example_c EXCLUDE_FROM_ALL example_c.c) target_link_libraries(example_c PRIVATE faiss_c) diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt index 5fdfc34dfd..3c7214a576 100644 --- a/c_api/gpu/CMakeLists.txt +++ b/c_api/gpu/CMakeLists.txt @@ -17,9 +17,17 @@ faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu) if (FAISS_ENABLE_ROCM) target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas) + target_link_libraries(faiss_c_avx2 PUBLIC hip::host roc::hipblas) + target_link_libraries(faiss_c_avx512 PUBLIC hip::host roc::hipblas) + target_link_libraries(faiss_c_avx512_spr PUBLIC hip::host roc::hipblas) + target_link_libraries(faiss_c_sve PUBLIC hip::host roc::hipblas) else() find_package(CUDAToolkit REQUIRED) target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$:cuvs::cuvs>) + target_link_libraries(faiss_c_avx2 PUBLIC CUDA::cudart CUDA::cublas $<$:cuvs::cuvs>) + target_link_libraries(faiss_c_avx512 PUBLIC CUDA::cudart CUDA::cublas 
$<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+  target_link_libraries(faiss_c_avx512_spr PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+  target_link_libraries(faiss_c_sve PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 endif()
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
diff --git a/c_api/impl/AuxIndexStructures_c.cpp b/c_api/impl/AuxIndexStructures_c.cpp
index 534d5a0769..a7d4f4d5f5 100644
--- a/c_api/impl/AuxIndexStructures_c.cpp
+++ b/c_api/impl/AuxIndexStructures_c.cpp
@@ -19,6 +19,7 @@ using faiss::DistanceComputer;
 using faiss::IDSelector;
 using faiss::IDSelectorAnd;
 using faiss::IDSelectorBatch;
+using faiss::IDSelectorBitmap;
 using faiss::IDSelectorNot;
 using faiss::IDSelectorOr;
 using faiss::IDSelectorRange;
@@ -119,6 +120,23 @@ int faiss_IDSelectorBatch_new(
     CATCH_AND_HANDLE
 }
 
+DEFINE_DESTRUCTOR(IDSelectorBitmap)
+
+DEFINE_GETTER(IDSelectorBitmap, size_t, n)
+DEFINE_GETTER(IDSelectorBitmap, const uint8_t*, bitmap)
+
+int faiss_IDSelectorBitmap_new(
+        FaissIDSelectorBitmap** p_sel,
+        size_t n,
+        const uint8_t* bitmap) {
+    try {
+        *p_sel = reinterpret_cast<FaissIDSelectorBitmap*>(
+                new IDSelectorBitmap(n, bitmap));
+        return 0;
+    }
+    CATCH_AND_HANDLE
+}
+
 int faiss_IDSelectorNot_new(
         FaissIDSelectorNot** p_sel,
         const FaissIDSelector* sel) {
diff --git a/c_api/impl/AuxIndexStructures_c.h b/c_api/impl/AuxIndexStructures_c.h
index 86b017a432..c4be6318bc 100644
--- a/c_api/impl/AuxIndexStructures_c.h
+++ b/c_api/impl/AuxIndexStructures_c.h
@@ -81,6 +81,17 @@ int faiss_IDSelectorBatch_new(
         size_t n,
         const idx_t* indices);
 
+FAISS_DECLARE_CLASS(IDSelectorBitmap)
+FAISS_DECLARE_DESTRUCTOR(IDSelectorBitmap)
+
+FAISS_DECLARE_GETTER(IDSelectorBitmap, size_t, n)
+FAISS_DECLARE_GETTER(IDSelectorBitmap, const uint8_t*, bitmap)
+
+int faiss_IDSelectorBitmap_new(
+        FaissIDSelectorBitmap** p_sel,
+        size_t n,
+        const uint8_t* bitmap);
+
 FAISS_DECLARE_CLASS(IDSelectorNot)
 int faiss_IDSelectorNot_new(
         FaissIDSelectorNot** p_sel,
diff --git a/c_api/impl/io_c.cpp b/c_api/impl/io_c.cpp
new file mode 100644
index 0000000000..58597b97fb
--- /dev/null
+++ b/c_api/impl/io_c.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#include "io_c.h"
+#include <faiss/impl/io.h>
+#include "../macros_impl.h"
+
+using faiss::IOReader;
+using faiss::IOWriter;
+
+struct CustomIOReader : IOReader {
+    size_t (*func)(void* ptr, size_t size, size_t nitems) = nullptr;
+
+    CustomIOReader(size_t (*func_in)(void* ptr, size_t size, size_t nitems));
+
+    size_t operator()(void* ptr, size_t size, size_t nitems) override;
+};
+
+CustomIOReader::CustomIOReader(
+        size_t (*func_in)(void* ptr, size_t size, size_t nitems))
+        : func(func_in) {}
+
+size_t CustomIOReader::operator()(void* ptr, size_t size, size_t nitems) {
+    return func(ptr, size, nitems);
+}
+
+int faiss_CustomIOReader_new(
+        FaissCustomIOReader** p_out,
+        size_t (*func_in)(void* ptr, size_t size, size_t nitems)) {
+    try {
+        *p_out = reinterpret_cast<FaissCustomIOReader*>(
+                new CustomIOReader(func_in));
+    }
+    CATCH_AND_HANDLE
+}
+
+void faiss_CustomIOReader_free(FaissCustomIOReader* obj) {
+    delete reinterpret_cast<CustomIOReader*>(obj);
+}
+
+struct CustomIOWriter : IOWriter {
+    size_t (*func)(const void* ptr, size_t size, size_t nitems) = nullptr;
+
+    CustomIOWriter(
+            size_t (*func_in)(const void* ptr, size_t size, size_t nitems));
+
+    size_t operator()(const void* ptr, size_t size, size_t nitems) override;
+};
+
+CustomIOWriter::CustomIOWriter(
+        size_t (*func_in)(const void* ptr, size_t size, size_t nitems))
+        : func(func_in) {}
+
+size_t CustomIOWriter::operator()(const void* ptr, size_t size, size_t nitems) {
+    return func(ptr, size, nitems);
+}
+
+int faiss_CustomIOWriter_new(
+        FaissCustomIOWriter** p_out,
+        size_t (*func_in)(const void* ptr, size_t size, size_t nitems)) {
+    try {
+        *p_out = reinterpret_cast<FaissCustomIOWriter*>(
+                new CustomIOWriter(func_in));
+    }
+    CATCH_AND_HANDLE
+}
+
+void faiss_CustomIOWriter_free(FaissCustomIOWriter* obj) {
+    delete reinterpret_cast<CustomIOWriter*>(obj);
+}
diff --git a/c_api/impl/io_c.h b/c_api/impl/io_c.h
new file mode 100644
index 0000000000..94a604828d
--- /dev/null
+++ b/c_api/impl/io_c.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c -*-
+
+#ifndef FAISS_IO_C_H
+#define FAISS_IO_C_H
+
+#include <stdio.h>
+#include "../faiss_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+FAISS_DECLARE_CLASS(IOReader)
+FAISS_DECLARE_DESTRUCTOR(IOReader)
+
+FAISS_DECLARE_CLASS(IOWriter)
+FAISS_DECLARE_DESTRUCTOR(IOWriter)
+
+/*******************************************************
+ * Custom reader + writer
+ *
+ * Reader and writer which wraps a function pointer,
+ * primarily for FFI use.
+ *******************************************************/
+
+FAISS_DECLARE_CLASS(CustomIOReader)
+FAISS_DECLARE_DESTRUCTOR(CustomIOReader)
+
+int faiss_CustomIOReader_new(
+        FaissCustomIOReader** p_out,
+        size_t (*func_in)(void* ptr, size_t size, size_t nitems));
+
+FAISS_DECLARE_CLASS(CustomIOWriter)
+FAISS_DECLARE_DESTRUCTOR(CustomIOWriter)
+
+int faiss_CustomIOWriter_new(
+        FaissCustomIOWriter** p_out,
+        size_t (*func_in)(const void* ptr, size_t size, size_t nitems));
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/c_api/index_io_c.cpp b/c_api/index_io_c.cpp
index 889f32a251..4e50cd6926 100644
--- a/c_api/index_io_c.cpp
+++ b/c_api/index_io_c.cpp
@@ -15,6 +15,8 @@
 
 using faiss::Index;
 using faiss::IndexBinary;
+using faiss::IOReader;
+using faiss::IOWriter;
 using faiss::VectorTransform;
 
 int faiss_write_index(const FaissIndex* idx, FILE* f) {
@@ -31,6 +33,19 @@ int faiss_write_index_fname(const FaissIndex* idx, const char* fname) {
     CATCH_AND_HANDLE
 }
 
+int faiss_write_index_custom(
+        const FaissIndex* idx,
+        FaissIOWriter* io_writer,
+        int io_flags) {
+    try {
+        faiss::write_index(
+                reinterpret_cast<const Index*>(idx),
+                reinterpret_cast<IOWriter*>(io_writer),
+                io_flags);
+    }
+    CATCH_AND_HANDLE
+}
+
 int faiss_read_index(FILE* f, int io_flags, FaissIndex** p_out) {
     try {
         auto out = faiss::read_index(f, io_flags);
@@ -50,6 +65,18 @@ int faiss_read_index_fname(
     CATCH_AND_HANDLE
 }
 
+int faiss_read_index_custom(
+        FaissIOReader* io_reader,
+        int io_flags,
+        FaissIndex** p_out) {
+    try {
+        auto out = faiss::read_index(
+                reinterpret_cast<IOReader*>(io_reader), io_flags);
+        *p_out = reinterpret_cast<FaissIndex*>(out);
+    }
+    CATCH_AND_HANDLE
+}
+
 int faiss_write_index_binary(const FaissIndexBinary* idx, FILE* f) {
     try {
         faiss::write_index_binary(reinterpret_cast<const IndexBinary*>(idx), f);
@@ -67,6 +94,17 @@ int faiss_write_index_binary_fname(
     CATCH_AND_HANDLE
 }
 
+int faiss_write_index_binary_custom(
+        const FaissIndexBinary* idx,
+        FaissIOWriter* io_writer) {
+    try {
+        faiss::write_index_binary(
+                reinterpret_cast<const IndexBinary*>(idx),
+                reinterpret_cast<IOWriter*>(io_writer));
+    }
+    CATCH_AND_HANDLE
+}
+
 int faiss_read_index_binary(FILE* f, int io_flags, FaissIndexBinary** p_out) {
     try {
         auto out = faiss::read_index_binary(f, io_flags);
@@ -86,6 +124,18 @@ int faiss_read_index_binary_fname(
     CATCH_AND_HANDLE
 }
 
+int faiss_read_index_binary_custom(
+        FaissIOReader* io_reader,
+        int io_flags,
+        FaissIndexBinary** p_out) {
+    try {
+        auto out = faiss::read_index_binary(
+                reinterpret_cast<IOReader*>(io_reader), io_flags);
+        *p_out = reinterpret_cast<FaissIndexBinary*>(out);
+    }
+    CATCH_AND_HANDLE
+}
+
 int faiss_read_VectorTransform_fname(
         const char* fname,
         FaissVectorTransform** p_out) {
diff --git a/c_api/index_io_c.h b/c_api/index_io_c.h
index fd4da615e5..8e390dc920 100644
--- a/c_api/index_io_c.h
+++ b/c_api/index_io_c.h
@@ -16,6 +16,7 @@
 #include "Index_c.h"
 #include "VectorTransform_c.h"
 #include "faiss_c.h"
+#include "impl/io_c.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,6 +33,13 @@ int faiss_write_index(const FaissIndex* idx, FILE* f);
  */
 int faiss_write_index_fname(const FaissIndex* idx, const char* fname);
 
+/** Write index to a custom writer.
+ */ +int faiss_write_index_custom( + const FaissIndex* idx, + FaissIOWriter* io_writer, + int io_flags); + #define FAISS_IO_FLAG_MMAP 1 #define FAISS_IO_FLAG_READ_ONLY 2 @@ -45,6 +53,13 @@ int faiss_read_index(FILE* f, int io_flags, FaissIndex** p_out); */ int faiss_read_index_fname(const char* fname, int io_flags, FaissIndex** p_out); +/** Read index from a custom reader. + */ +int faiss_read_index_custom( + FaissIOReader* io_reader, + int io_flags, + FaissIndex** p_out); + /** Write index to a file. * This is equivalent to `faiss::write_index_binary` when a file descriptor is * provided. @@ -59,6 +74,12 @@ int faiss_write_index_binary_fname( const FaissIndexBinary* idx, const char* fname); +/** Write binary index to a custom writer. + */ +int faiss_write_index_binary_custom( + const FaissIndexBinary* idx, + FaissIOWriter* io_writer); + /** Read index from a file. * This is equivalent to `faiss:read_index_binary` when a file descriptor is * given. @@ -73,6 +94,13 @@ int faiss_read_index_binary_fname( int io_flags, FaissIndexBinary** p_out); +/** Read binary index from a custom reader. + */ +int faiss_read_index_binary_custom( + FaissIOReader* io_reader, + int io_flags, + FaissIndexBinary** p_out); + /** Read vector transform from a file. * This is equivalent to `faiss:read_VectorTransform` when a file path is given. */ diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 2ed6a78cf5..ba0dac02c2 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -15,7 +15,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -set(RAPIDS_VERSION "24.12") +set(RAPIDS_VERSION "25.04") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/conda/faiss-gpu-cuvs/build-lib.sh b/conda/faiss-gpu-cuvs/build-lib.sh index 37f0381809..3fd46428dd 100644 --- a/conda/faiss-gpu-cuvs/build-lib.sh +++ b/conda/faiss-gpu-cuvs/build-lib.sh @@ -10,6 +10,7 @@ set -e # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so cmake -B _build \ -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_C_API=ON \ -DBUILD_TESTING=OFF \ -DFAISS_OPT_LEVEL=avx512 \ -DFAISS_ENABLE_GPU=ON \ @@ -20,7 +21,7 @@ cmake -B _build \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . -make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 +make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 faiss_c faiss_c_avx2 faiss_c_avx512 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml index a8edf41e58..c4de80bc38 100644 --- a/conda/faiss-gpu-cuvs/meta.yaml +++ b/conda/faiss-gpu-cuvs/meta.yaml @@ -50,7 +50,7 @@ outputs: - {{ compiler('cxx') }} =12.4 - sysroot_linux-64 =2.17 # [linux64] - llvm-openmp # [osx] - - cmake >=3.26.4 + - cmake >=3.30.4 - make =4.2 # [not win] - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] @@ -65,17 +65,18 @@ outputs: host: - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - - openblas =0.3 # [not x86_64] - - libcuvs =24.12 + - openblas =0.3.29 # [not x86_64] + - libcuvs =25.04 - cuda-version {{ cuda_constraints }} run: - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - - openblas =0.3 # [not x86_64] + - openblas =0.3.29 # [not x86_64] - cuda-cudart {{ cuda_constraints }} - libcublas {{ libcublas_constraints }} - - libcuvs =24.12 + - libcuvs =25.04 - 
cuda-version {{ cuda_constraints }} + - libnvjitlink test: requires: - conda-build @@ -100,14 +101,16 @@ outputs: - cmake >=3.26.4 - make =4.2 # [not win] - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - - mkl =2023 # [x86_64] + - mkl =2023.0 # [x86_64] - cuda-toolkit {{ cudatoolkit }} host: + - mkl =2023.0 # [x86_64] - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: + - mkl =2023.0 # [x86_64] - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 diff --git a/conda/faiss-gpu/build-lib.sh b/conda/faiss-gpu/build-lib.sh index befad80547..71c77e3ca1 100755 --- a/conda/faiss-gpu/build-lib.sh +++ b/conda/faiss-gpu/build-lib.sh @@ -16,6 +16,7 @@ fi # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so cmake -B _build \ -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_C_API=ON \ -DBUILD_TESTING=OFF \ -DFAISS_OPT_LEVEL=avx512 \ -DFAISS_ENABLE_GPU=ON \ @@ -26,7 +27,7 @@ cmake -B _build \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . 
-make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 +make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 faiss_c faiss_c_avx2 faiss_c_avx512 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff --git a/conda/faiss-gpu/meta.yaml b/conda/faiss-gpu/meta.yaml index f15c9556d9..5d5b52c09e 100644 --- a/conda/faiss-gpu/meta.yaml +++ b/conda/faiss-gpu/meta.yaml @@ -57,10 +57,10 @@ outputs: - gcc_linux-64 =11.2 # [cudatoolkit == '11.4.4'] host: - mkl =2023.0 # [x86_64] - - openblas =0.3 # [not x86_64] + - openblas =0.3.29 # [not x86_64] run: - mkl =2023.0 # [x86_64] - - openblas =0.3 # [not x86_64] + - openblas =0.3.29 # [not x86_64] - cuda-cudart {{ cuda_constraints }} - libcublas {{ libcublas_constraints }} test: @@ -89,12 +89,15 @@ outputs: - make =4.4 # [osx and arm64] - _openmp_mutex =4.5=2_kmp_llvm # [x86_64 and not win] - cuda-toolkit {{ cudatoolkit }} + - mkl-devel =2023.0 # [x86_64] host: + - mkl =2023.0 # [x86_64] - python {{ python }} - numpy >=1.19,<2 - _openmp_mutex =4.5=2_kmp_llvm # [x86_64 and not win] - {{ pin_subpackage('libfaiss', exact=True) }} run: + - mkl =2023.0 # [x86_64] - python {{ python }} - numpy >=1.19,<2 - packaging diff --git a/conda/faiss/build-lib-arm64.sh b/conda/faiss/build-lib-arm64.sh index fbc261515c..e08da7d10b 100755 --- a/conda/faiss/build-lib-arm64.sh +++ b/conda/faiss/build-lib-arm64.sh @@ -10,13 +10,15 @@ set -e # Build libfaiss.so cmake -B _build \ -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_C_API=ON \ -DBUILD_TESTING=OFF \ + -DFAISS_OPT_LEVEL=sve \ -DFAISS_ENABLE_GPU=OFF \ -DFAISS_ENABLE_PYTHON=OFF \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . 
-make -C _build -j$(nproc) faiss +make -C _build -j$(nproc) faiss faiss_sve faiss_c faiss_c_sve cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff --git a/conda/faiss/build-lib-osx.sh b/conda/faiss/build-lib-osx.sh index ad099b46e3..3de5f650a1 100755 --- a/conda/faiss/build-lib-osx.sh +++ b/conda/faiss/build-lib-osx.sh @@ -10,6 +10,7 @@ set -e # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so cmake -B _build \ -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_C_API=ON \ -DBUILD_TESTING=OFF \ -DFAISS_OPT_LEVEL=avx512 \ -DFAISS_ENABLE_GPU=OFF \ @@ -21,7 +22,7 @@ cmake -B _build \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . -make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 +make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 faiss_c faiss_c_avx2 faiss_c_avx512 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff --git a/conda/faiss/build-lib.sh b/conda/faiss/build-lib.sh index 8c986d5e68..2db92e890d 100755 --- a/conda/faiss/build-lib.sh +++ b/conda/faiss/build-lib.sh @@ -10,6 +10,7 @@ set -e # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so cmake -B _build \ -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_C_API=ON \ -DBUILD_TESTING=OFF \ -DFAISS_OPT_LEVEL=avx512 \ -DFAISS_ENABLE_GPU=OFF \ @@ -18,7 +19,7 @@ cmake -B _build \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . 
-make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 +make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512 faiss_c faiss_c_avx2 faiss_c_avx512 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff --git a/conda/faiss/meta.yaml b/conda/faiss/meta.yaml index 947b136179..81f2cc6501 100644 --- a/conda/faiss/meta.yaml +++ b/conda/faiss/meta.yaml @@ -45,43 +45,37 @@ outputs: - make =4.4 # [osx and arm64] {% if PY_VER == '3.9' or PY_VER == '3.10' or PY_VER == '3.11' %} - mkl-devel =2023.0 # [x86_64] - - liblief =0.12.3 # [not win] - python_abi <3.12 {% elif PY_VER == '3.12' %} - mkl-devel >=2023.2.0 # [x86_64 and not win] - mkl-devel =2023.1.0 # [x86_64 and win] - - liblief =0.15.1 # [not win] - python_abi =3.12 {% endif %} host: - python {{ python }} {% if PY_VER == '3.9' or PY_VER == '3.10' or PY_VER == '3.11' %} - mkl =2023.0 # [x86_64] - - liblief =0.12.3 # [not win] - python_abi <3.12 {% elif PY_VER == '3.12' %} - mkl >=2023.2.0 # [x86_64 and not win] - mkl =2023.1.0 # [x86_64 and win] - - liblief =0.15.1 # [not win] - python_abi =3.12 {% endif %} - - openblas =0.3 # [not x86_64] + - openblas =0.3.29 # [not x86_64] run: - python {{ python }} {% if PY_VER == '3.9' or PY_VER == '3.10' or PY_VER == '3.11' %} - mkl =2023.0 # [x86_64] - - liblief =0.12.3 # [not win] - python_abi <3.12 {% elif PY_VER == '3.12' %} - mkl >=2023.2.0 # [x86_64 and not win] - mkl =2023.1.0 # [x86_64 and win] - - liblief =0.15.1 # [not win] - python_abi =3.12 {% endif %} - - openblas =0.3 # [not x86_64] + - openblas =0.3.29 # [not x86_64] test: requires: - - conda-build + - conda-build =25.1.2 commands: - test -f $PREFIX/lib/libfaiss$SHLIB_EXT # [not win] - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT # [x86_64 and not win] diff --git a/demos/demo_residual_quantizer.cpp b/demos/demo_residual_quantizer.cpp index cf9c0cdf85..2f88be38f1 100644 --- a/demos/demo_residual_quantizer.cpp +++ b/demos/demo_residual_quantizer.cpp @@ -103,7 +103,7 @@ int main() 
{ index.is_trained = true; // override vectors - index.codes = raw_codes; + index.codes = faiss::MaybeOwnedVector(raw_codes); index.ntotal = nb; tic(); diff --git a/faiss/AutoTune.cpp b/faiss/AutoTune.cpp index 35aa6d54af..438559dc3f 100644 --- a/faiss/AutoTune.cpp +++ b/faiss/AutoTune.cpp @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -315,9 +314,6 @@ bool ParameterSpace::combination_ge(size_t c1, size_t c2) const { return true; } -#define DC(classname) \ - const classname* ix = dynamic_cast(index) - static void init_pq_ParameterRange( const ProductQuantizer& pq, ParameterRange& pr) { @@ -341,6 +337,10 @@ ParameterRange& ParameterSpace::add_range(const std::string& name) { return parameter_ranges.back(); } +// Do not use this macro if ix will be unused +#define DC(classname) \ + const classname* ix = dynamic_cast(index) + /// initialize with reasonable parameters for this type of index void ParameterSpace::initialize(const Index* index) { if (DC(IndexPreTransform)) { @@ -396,7 +396,7 @@ void ParameterSpace::initialize(const Index* index) { std::numeric_limits::infinity()); } } - if (DC(IndexIVFPQR)) { + if (dynamic_cast(index)) { ParameterRange& pr = add_range("k_factor"); for (int i = 0; i <= 6; i++) { pr.values.push_back(1 << i); @@ -412,9 +412,6 @@ void ParameterSpace::initialize(const Index* index) { #undef DC -// non-const version -#define DC(classname) classname* ix = dynamic_cast(index) - /// set a combination of parameters on an index void ParameterSpace::set_index_parameters(Index* index, size_t cno) const { for (int i = 0; i < parameter_ranges.size(); i++) { @@ -444,6 +441,10 @@ void ParameterSpace::set_index_parameters( } } +// non-const version +// Do not use this macro if ix will be unused +#define DC(classname) classname* ix = dynamic_cast(index) + void ParameterSpace::set_index_parameter( Index* index, const std::string& name, @@ -576,6 +577,8 @@ void ParameterSpace::set_index_parameter( name.c_str()); } +#undef DC + void 
ParameterSpace::display() const { printf("ParameterSpace, %zd parameters, %zd combinations:\n", parameter_ranges.size(), diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 32b45c204d..93bc932a4d 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -28,6 +28,7 @@ set(FAISS_SRC IndexIVFAdditiveQuantizerFastScan.cpp IndexIVFPQFastScan.cpp IndexIVFPQR.cpp + IndexIVFRaBitQ.cpp IndexIVFSpectralHash.cpp IndexLSH.cpp IndexNNDescent.cpp @@ -39,6 +40,7 @@ set(FAISS_SRC IndexIVFIndependentQuantizer.cpp IndexPQFastScan.cpp IndexPreTransform.cpp + IndexRaBitQ.cpp IndexRefine.cpp IndexReplicas.cpp IndexRowwiseMinMax.cpp @@ -61,6 +63,7 @@ set(FAISS_SRC impl/PolysemousTraining.cpp impl/ProductQuantizer.cpp impl/AdditiveQuantizer.cpp + impl/RaBitQuantizer.cpp impl/ResidualQuantizer.cpp impl/LocalSearchQuantizer.cpp impl/ProductAdditiveQuantizer.cpp @@ -70,12 +73,12 @@ set(FAISS_SRC impl/io.cpp impl/kmeans1d.cpp impl/lattice_Zn.cpp + impl/mapped_io.cpp impl/pq4_fast_scan.cpp impl/pq4_fast_scan_search_1.cpp impl/pq4_fast_scan_search_qbs.cpp impl/residual_quantizer_encode_steps.cpp - impl/io.cpp - impl/lattice_Zn.cpp + impl/zerocopy_io.cpp impl/NNDescent.cpp invlists/BlockInvertedLists.cpp invlists/DirectMap.cpp @@ -124,6 +127,7 @@ set(FAISS_HEADERS IndexIVFAdditiveQuantizerFastScan.h IndexIVFPQFastScan.h IndexIVFPQR.h + IndexIVFRaBitQ.h IndexIVFSpectralHash.h IndexLSH.h IndexNeuralNetCodec.h @@ -137,6 +141,7 @@ set(FAISS_HEADERS IndexPreTransform.h IndexRefine.h IndexReplicas.h + IndexRaBitQ.h IndexRowwiseMinMax.h IndexScalarQuantizer.h IndexShards.h @@ -160,12 +165,14 @@ set(FAISS_HEADERS impl/LocalSearchQuantizer.h impl/ProductAdditiveQuantizer.h impl/LookupTableScaler.h + impl/maybe_owned_vector.h impl/NNDescent.h impl/NSG.h impl/PolysemousTraining.h impl/ProductQuantizer-inl.h impl/ProductQuantizer.h impl/Quantizer.h + impl/RaBitQuantizer.h impl/ResidualQuantizer.h impl/ResultHandler.h impl/ScalarQuantizer.h @@ -357,6 +364,10 @@ if(WIN32) 
target_compile_definitions(faiss_sve PRIVATE FAISS_MAIN_LIB) endif() +if(WIN32) + set_target_properties(faiss PROPERTIES LINK_FLAGS "-Wl,--export-all-symbols") +endif() + string(FIND "${CMAKE_CXX_FLAGS}" "FINTEGER" finteger_idx) if (${finteger_idx} EQUAL -1) target_compile_definitions(faiss PRIVATE FINTEGER=int) @@ -439,7 +450,7 @@ if(FAISS_OPT_LEVEL STREQUAL "avx512") ) endif() if(FAISS_OPT_LEVEL STREQUAL "avx512_spr") - install(TARGETS faiss_avx2 faiss_avx512 faiss_avx512_spr + install(TARGETS faiss_avx2 faiss_avx512_spr EXPORT faiss-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/faiss/Clustering.cpp b/faiss/Clustering.cpp index 5a4ad15468..47bd03d797 100644 --- a/faiss/Clustering.cpp +++ b/faiss/Clustering.cpp @@ -34,22 +34,6 @@ Clustering::Clustering(int d, int k) : d(d), k(k) {} Clustering::Clustering(int d, int k, const ClusteringParameters& cp) : ClusteringParameters(cp), d(d), k(k) {} -static double imbalance_factor(int n, int k, int64_t* assign) { - std::vector hist(k, 0); - for (int i = 0; i < n; i++) - hist[assign[i]]++; - - double tot = 0, uf = 0; - - for (int i = 0; i < k; i++) { - tot += hist[i]; - uf += hist[i] * (double)hist[i]; - } - uf = uf * k / (tot * tot); - - return uf; -} - void Clustering::post_process_centroids() { if (spherical) { fvec_renorm_L2(d, k, centroids.data()); diff --git a/faiss/IVFlib.cpp b/faiss/IVFlib.cpp index f6bed2a4c4..bbc022b71b 100644 --- a/faiss/IVFlib.cpp +++ b/faiss/IVFlib.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -16,7 +17,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -198,12 +201,32 @@ static void shift_and_add( memcpy(dst.data() + insert_point, src.data(), src.size() * sizeof(T)); } +template +static void shift_and_add( + MaybeOwnedVector& dst, + size_t remove, + const MaybeOwnedVector& src) { + if (remove > 0) + memmove(dst.data(), + dst.data() + remove, + (dst.size() - remove) * 
sizeof(T)); + size_t insert_point = dst.size() - remove; + dst.resize(insert_point + src.size()); + memcpy(dst.data() + insert_point, src.data(), src.size() * sizeof(T)); +} + template static void remove_from_begin(std::vector& v, size_t remove) { if (remove > 0) v.erase(v.begin(), v.begin() + remove); } +template +static void remove_from_begin(MaybeOwnedVector& v, size_t remove) { + if (remove > 0) + v.erase(v.begin(), v.begin() + remove); +} + void SlidingIndexWindow::step(const Index* sub_index, bool remove_oldest) { FAISS_THROW_IF_NOT_MSG( !remove_oldest || n_slice > 0, @@ -519,5 +542,195 @@ void ivf_residual_add_from_flat_codes( index->ntotal += nb; } +int64_t DefaultShardingFunction::operator()(int64_t i, int64_t shard_count) { + return i % shard_count; +} + +void handle_ivf( + faiss::IndexIVF* index, + int64_t shard_count, + const std::string& filename_template, + ShardingFunction* sharding_function, + bool generate_ids) { + std::vector sharded_indexes(shard_count); + auto clone = static_cast(faiss::clone_index(index)); + clone->quantizer->reset(); + for (int64_t i = 0; i < shard_count; i++) { + sharded_indexes[i] = + static_cast(faiss::clone_index(clone)); + if (generate_ids) { + // Assume the quantizer does not natively support add_with_ids. + sharded_indexes[i]->quantizer = + new IndexIDMap2(sharded_indexes[i]->quantizer); + } + } + + // assign centroids to each sharded Index based on sharding_function, and + // add them to the quantizer of each sharded index + std::vector> sharded_centroids(shard_count); + std::vector> xids(shard_count); + for (int64_t i = 0; i < index->quantizer->ntotal; i++) { + int64_t shard_id = (*sharding_function)(i, shard_count); + // Since the quantizer does not natively support add_with_ids, we simply + // generate them. 
+ xids[shard_id].push_back(i); + float* reconstructed = new float[index->quantizer->d]; + index->quantizer->reconstruct(i, reconstructed); + sharded_centroids[shard_id].insert( + sharded_centroids[shard_id].end(), + &reconstructed[0], + &reconstructed[index->quantizer->d]); + delete[] reconstructed; + } + for (int64_t i = 0; i < shard_count; i++) { + if (generate_ids) { + sharded_indexes[i]->quantizer->add_with_ids( + sharded_centroids[i].size() / index->quantizer->d, + sharded_centroids[i].data(), + xids[i].data()); + } else { + sharded_indexes[i]->quantizer->add( + sharded_centroids[i].size() / index->quantizer->d, + sharded_centroids[i].data()); + } + } + + for (int64_t i = 0; i < shard_count; i++) { + char fname[256]; + snprintf(fname, 256, filename_template.c_str(), i); + faiss::write_index(sharded_indexes[i], fname); + } + + for (int64_t i = 0; i < shard_count; i++) { + delete sharded_indexes[i]; + } +} + +void handle_binary_ivf( + faiss::IndexBinaryIVF* index, + int64_t shard_count, + const std::string& filename_template, + ShardingFunction* sharding_function, + bool generate_ids) { + std::vector sharded_indexes(shard_count); + + auto clone = static_cast( + faiss::clone_binary_index(index)); + clone->quantizer->reset(); + + for (int64_t i = 0; i < shard_count; i++) { + sharded_indexes[i] = static_cast( + faiss::clone_binary_index(clone)); + if (generate_ids) { + // Assume the quantizer does not natively support add_with_ids. 
+ sharded_indexes[i]->quantizer = + new IndexBinaryIDMap2(sharded_indexes[i]->quantizer); + } + } + + // assign centroids to each sharded Index based on sharding_function, and + // add them to the quantizer of each sharded index + int64_t reconstruction_size = index->quantizer->d / 8; + std::vector> sharded_centroids(shard_count); + std::vector> xids(shard_count); + for (int64_t i = 0; i < index->quantizer->ntotal; i++) { + int64_t shard_id = (*sharding_function)(i, shard_count); + // Since the quantizer does not natively support add_with_ids, we simply + // generate them. + xids[shard_id].push_back(i); + uint8_t* reconstructed = new uint8_t[reconstruction_size]; + index->quantizer->reconstruct(i, reconstructed); + sharded_centroids[shard_id].insert( + sharded_centroids[shard_id].end(), + &reconstructed[0], + &reconstructed[reconstruction_size]); + delete[] reconstructed; + } + for (int64_t i = 0; i < shard_count; i++) { + if (generate_ids) { + sharded_indexes[i]->quantizer->add_with_ids( + sharded_centroids[i].size() / reconstruction_size, + sharded_centroids[i].data(), + xids[i].data()); + } else { + sharded_indexes[i]->quantizer->add( + sharded_centroids[i].size() / reconstruction_size, + sharded_centroids[i].data()); + } + } + + for (int64_t i = 0; i < shard_count; i++) { + char fname[256]; + snprintf(fname, 256, filename_template.c_str(), i); + faiss::write_index_binary(sharded_indexes[i], fname); + } + + for (int64_t i = 0; i < shard_count; i++) { + delete sharded_indexes[i]; + } +} + +template +void sharding_helper( + IndexType* index, + int64_t shard_count, + const std::string& filename_template, + ShardingFunction* sharding_function, + bool generate_ids) { + FAISS_THROW_IF_MSG(index->quantizer->ntotal == 0, "No centroids to shard."); + FAISS_THROW_IF_MSG( + filename_template.find("%d") == std::string::npos, + "Invalid filename_template. 
Must contain format specifier for shard count."); + + DefaultShardingFunction default_sharding_function; + if (sharding_function == nullptr) { + sharding_function = &default_sharding_function; + } + + if (typeid(IndexType) == typeid(faiss::IndexIVF)) { + handle_ivf( + dynamic_cast(index), + shard_count, + filename_template, + sharding_function, + generate_ids); + } else if (typeid(IndexType) == typeid(faiss::IndexBinaryIVF)) { + handle_binary_ivf( + dynamic_cast(index), + shard_count, + filename_template, + sharding_function, + generate_ids); + } +} + +void shard_ivf_index_centroids( + faiss::IndexIVF* index, + int64_t shard_count, + const std::string& filename_template, + ShardingFunction* sharding_function, + bool generate_ids) { + sharding_helper( + index, + shard_count, + filename_template, + sharding_function, + generate_ids); +} + +void shard_binary_ivf_index_centroids( + faiss::IndexBinaryIVF* index, + int64_t shard_count, + const std::string& filename_template, + ShardingFunction* sharding_function, + bool generate_ids) { + sharding_helper( + index, + shard_count, + filename_template, + sharding_function, + generate_ids); +} + } // namespace ivflib } // namespace faiss diff --git a/faiss/IVFlib.h b/faiss/IVFlib.h index 6f6a590c72..8a6dd3f630 100644 --- a/faiss/IVFlib.h +++ b/faiss/IVFlib.h @@ -14,6 +14,7 @@ * IndexIVFs embedded within an IndexPreTransform. 
*/ +#include #include @@ -167,6 +168,47 @@ void ivf_residual_add_from_flat_codes( const uint8_t* codes, int64_t code_size = -1); +struct ShardingFunction { + virtual int64_t operator()(int64_t i, int64_t shard_count) = 0; + virtual ~ShardingFunction() = default; + ShardingFunction() {} + ShardingFunction(const ShardingFunction&) = default; + ShardingFunction(ShardingFunction&&) = default; + ShardingFunction& operator=(const ShardingFunction&) = default; + ShardingFunction& operator=(ShardingFunction&&) = default; +}; +struct DefaultShardingFunction : ShardingFunction { + int64_t operator()(int64_t i, int64_t shard_count) override; +}; + +/** + * Shards an IVF index's centroids by the given sharding function, and writes + * each shard to the path given by filename_template. The centroids must already + * be added to the index quantizer. + * + * @param index The IVF index containing centroids to shard. + * @param shard_count Number of shards. + * @param filename_template Template for shard filenames. + * @param sharding_function The function to shard by. The default is ith vector + * mod shard_count. + * @param generate_ids Generates ids using IndexIDMap2. If true, ids will + * match the default ids in the unsharded index. + * Exactly shard_count shard files are written to disk.
+ */ +void shard_ivf_index_centroids( + IndexIVF* index, + int64_t shard_count = 20, + const std::string& filename_template = "shard.%d.index", + ShardingFunction* sharding_function = nullptr, + bool generate_ids = false); + +void shard_binary_ivf_index_centroids( + faiss::IndexBinaryIVF* index, + int64_t shard_count = 20, + const std::string& filename_template = "shard.%d.index", + ShardingFunction* sharding_function = nullptr, + bool generate_ids = false); + } // namespace ivflib } // namespace faiss diff --git a/faiss/Index.h b/faiss/Index.h index 544086f9ad..2474f08be4 100644 --- a/faiss/Index.h +++ b/faiss/Index.h @@ -17,7 +17,7 @@ #include #define FAISS_VERSION_MAJOR 1 -#define FAISS_VERSION_MINOR 10 +#define FAISS_VERSION_MINOR 11 #define FAISS_VERSION_PATCH 0 // Macro to combine the version components into a single string diff --git a/faiss/IndexBinaryFlat.cpp b/faiss/IndexBinaryFlat.cpp index f6e2e218c0..bbb51d7c93 100644 --- a/faiss/IndexBinaryFlat.cpp +++ b/faiss/IndexBinaryFlat.cpp @@ -37,8 +37,8 @@ void IndexBinaryFlat::search( int32_t* distances, idx_t* labels, const SearchParameters* params) const { - FAISS_THROW_IF_NOT_MSG( - !params, "search params not supported for this index"); + // Extract IDSelector from params if present + const IDSelector* sel = params ? 
params->sel : nullptr; FAISS_THROW_IF_NOT(k > 0); const idx_t block_size = query_batch_size; @@ -60,7 +60,8 @@ void IndexBinaryFlat::search( ntotal, code_size, /* ordered = */ true, - approx_topk_mode); + approx_topk_mode, + sel); } else { hammings_knn_mc( x + s * code_size, @@ -70,7 +71,8 @@ void IndexBinaryFlat::search( k, code_size, distances + s * k, - labels + s * k); + labels + s * k, + sel); } } } @@ -107,9 +109,9 @@ void IndexBinaryFlat::range_search( int radius, RangeSearchResult* result, const SearchParameters* params) const { - FAISS_THROW_IF_NOT_MSG( - !params, "search params not supported for this index"); - hamming_range_search(x, xb.data(), n, ntotal, radius, code_size, result); + const IDSelector* sel = params ? params->sel : nullptr; + hamming_range_search( + x, xb.data(), n, ntotal, radius, code_size, result, sel); } } // namespace faiss diff --git a/faiss/IndexBinaryFlat.h b/faiss/IndexBinaryFlat.h index f6188322ad..0ce43f3e9d 100644 --- a/faiss/IndexBinaryFlat.h +++ b/faiss/IndexBinaryFlat.h @@ -14,6 +14,7 @@ #include +#include #include namespace faiss { @@ -21,7 +22,7 @@ namespace faiss { /** Index that stores the full vectors and performs exhaustive search. */ struct IndexBinaryFlat : IndexBinary { /// database vectors, size ntotal * d / 8 - std::vector xb; + MaybeOwnedVector xb; /** Select between using a heap or counting to select the k smallest values * when scanning inverted lists. 
diff --git a/faiss/IndexFlatCodes.cpp b/faiss/IndexFlatCodes.cpp index 61cc995ce9..47854cccf2 100644 --- a/faiss/IndexFlatCodes.cpp +++ b/faiss/IndexFlatCodes.cpp @@ -112,7 +112,7 @@ CodePacker* IndexFlatCodes::get_CodePacker() const { } void IndexFlatCodes::permute_entries(const idx_t* perm) { - std::vector new_codes(codes.size()); + MaybeOwnedVector new_codes(codes.size()); for (idx_t i = 0; i < ntotal; i++) { memcpy(new_codes.data() + i * code_size, diff --git a/faiss/IndexFlatCodes.h b/faiss/IndexFlatCodes.h index 8e5be6c4a5..5ca420b94a 100644 --- a/faiss/IndexFlatCodes.h +++ b/faiss/IndexFlatCodes.h @@ -7,9 +7,11 @@ #pragma once +#include + #include #include -#include +#include namespace faiss { @@ -21,7 +23,7 @@ struct IndexFlatCodes : Index { size_t code_size; /// encoded dataset, size ntotal * code_size - std::vector codes; + MaybeOwnedVector codes; IndexFlatCodes(); diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index a65d68dd35..331df9d025 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -8,9 +8,7 @@ #include #include -#include #include -#include #include #include #include @@ -124,7 +122,7 @@ void hnsw_add_vertices( int i1 = n; for (int pt_level = hist.size() - 1; - pt_level >= !index_hnsw.init_level0; + pt_level >= int(!index_hnsw.init_level0); pt_level--) { int i0 = i1 - hist[pt_level]; @@ -212,7 +210,9 @@ IndexHNSW::IndexHNSW(int d, int M, MetricType metric) : Index(d, metric), hnsw(M) {} IndexHNSW::IndexHNSW(Index* storage, int M) - : Index(storage->d, storage->metric_type), hnsw(M), storage(storage) {} + : Index(storage->d, storage->metric_type), hnsw(M), storage(storage) { + metric_arg = storage->metric_arg; +} IndexHNSW::~IndexHNSW() { if (own_fields) { @@ -237,19 +237,19 @@ void hnsw_search( idx_t n, const float* x, BlockResultHandler& bres, - const SearchParameters* params_in) { + const SearchParameters* params) { FAISS_THROW_IF_NOT_MSG( index->storage, "No storage index, please use IndexHNSWFlat (or variants) " "instead of 
IndexHNSW directly"); - const SearchParametersHNSW* params = nullptr; const HNSW& hnsw = index->hnsw; int efSearch = hnsw.efSearch; - if (params_in) { - params = dynamic_cast(params_in); - FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); - efSearch = params->efSearch; + if (params) { + if (const SearchParametersHNSW* hnsw_params = + dynamic_cast(params)) { + efSearch = hnsw_params->efSearch; + } } size_t n1 = 0, n2 = 0, ndis = 0, nhops = 0; @@ -294,13 +294,13 @@ void IndexHNSW::search( idx_t k, float* distances, idx_t* labels, - const SearchParameters* params_in) const { + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); using RH = HeapBlockResultHandler; RH bres(n, distances, labels, k); - hnsw_search(this, n, x, bres, params_in); + hnsw_search(this, n, x, bres, params); if (is_similarity_metric(this->metric_type)) { // we need to revert the negated distances @@ -408,17 +408,10 @@ void IndexHNSW::search_level_0( idx_t* labels, int nprobe, int search_type, - const SearchParameters* params_in) const { + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); FAISS_THROW_IF_NOT(nprobe > 0); - const SearchParametersHNSW* params = nullptr; - - if (params_in) { - params = dynamic_cast(params_in); - FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); - } - storage_idx_t ntotal = hnsw.levels.size(); using RH = HeapBlockResultHandler; diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index c796d7e18a..2d983b3c16 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -138,7 +138,7 @@ struct IndexHNSWPQ : IndexHNSW { void train(idx_t n, const float* x) override; }; -/** SQ index topped with with a HNSW structure to access elements +/** SQ index topped with a HNSW structure to access elements * more efficiently. 
*/ struct IndexHNSWSQ : IndexHNSW { diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp index ece7d0409a..4c98613a3f 100644 --- a/faiss/IndexIVF.cpp +++ b/faiss/IndexIVF.cpp @@ -456,7 +456,7 @@ void IndexIVF::search_preassigned( #pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap) num_threads(num_omp_threads) { std::unique_ptr scanner( - get_InvertedListScanner(store_pairs, sel)); + get_InvertedListScanner(store_pairs, sel, params)); /***************************************************** * Depending on parallel_mode, there are two possible ways @@ -797,7 +797,7 @@ void IndexIVF::range_search_preassigned( { RangeSearchPartialResult pres(result); std::unique_ptr scanner( - get_InvertedListScanner(store_pairs, sel)); + get_InvertedListScanner(store_pairs, sel, params)); FAISS_THROW_IF_NOT(scanner.get()); all_pres[omp_get_thread_num()] = &pres; @@ -913,7 +913,8 @@ void IndexIVF::range_search_preassigned( InvertedListScanner* IndexIVF::get_InvertedListScanner( bool /*store_pairs*/, - const IDSelector* /* sel */) const { + const IDSelector* /* sel */, + const IVFSearchParameters* /* params */) const { FAISS_THROW_MSG("get_InvertedListScanner not implemented"); } @@ -1297,6 +1298,14 @@ size_t InvertedListScanner::scan_codes( if (!keep_max) { for (size_t j = 0; j < list_size; j++) { + if (sel != nullptr) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + if (!sel->is_member(id)) { + codes += code_size; + continue; + } + } + float dis = distance_to_code(codes); if (dis < simi[0]) { int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; @@ -1307,6 +1316,14 @@ size_t InvertedListScanner::scan_codes( } } else { for (size_t j = 0; j < list_size; j++) { + if (sel != nullptr) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + if (!sel->is_member(id)) { + codes += code_size; + continue; + } + } + float dis = distance_to_code(codes); if (dis > simi[0]) { int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h index ebee506f22..5e1748e23b 100644 --- a/faiss/IndexIVF.h +++ b/faiss/IndexIVF.h @@ -312,11 +312,14 @@ struct IndexIVF : Index, IndexIVFInterface { /** Get a scanner for this index (store_pairs means ignore labels) * - * The default search implementation uses this to compute the distances + * The default search implementation uses this to compute the distances. + * Use sel instead of params->sel, because sel is initialized with + * params->sel, but may get overridden by IndexIVF's internal logic. */ virtual InvertedListScanner* get_InvertedListScanner( bool store_pairs = false, - const IDSelector* sel = nullptr) const; + const IDSelector* sel = nullptr, + const IVFSearchParameters* params = nullptr) const; /** reconstruct a vector. Works only if maintain_direct_map is set to 1 or 2 */ diff --git a/faiss/IndexIVFAdditiveQuantizer.cpp b/faiss/IndexIVFAdditiveQuantizer.cpp index 154fe4ab66..afc6e92805 100644 --- a/faiss/IndexIVFAdditiveQuantizer.cpp +++ b/faiss/IndexIVFAdditiveQuantizer.cpp @@ -253,7 +253,8 @@ struct AQInvertedListScannerLUT : AQInvertedListScanner { InvertedListScanner* IndexIVFAdditiveQuantizer::get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const { + const IDSelector* sel, + const IVFSearchParameters*) const { FAISS_THROW_IF_NOT(!sel); if (metric_type == METRIC_INNER_PRODUCT) { if (aq->search_type == AdditiveQuantizer::ST_decompress) { diff --git a/faiss/IndexIVFAdditiveQuantizer.h b/faiss/IndexIVFAdditiveQuantizer.h index dfb22d1110..c999a3f79a 100644 --- a/faiss/IndexIVFAdditiveQuantizer.h +++ b/faiss/IndexIVFAdditiveQuantizer.h @@ -52,7 +52,8 @@ struct IndexIVFAdditiveQuantizer : IndexIVF { InvertedListScanner* get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const override; + const IDSelector* sel, + const IVFSearchParameters* params) const override; void sa_decode(idx_t n, const uint8_t* codes, float* x) const override; diff 
--git a/faiss/IndexIVFFlat.cpp b/faiss/IndexIVFFlat.cpp index eb7b074558..661bae0a9a 100644 --- a/faiss/IndexIVFFlat.cpp +++ b/faiss/IndexIVFFlat.cpp @@ -224,7 +224,8 @@ InvertedListScanner* get_InvertedListScanner1( InvertedListScanner* IndexIVFFlat::get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const { + const IDSelector* sel, + const IVFSearchParameters*) const { if (sel) { return get_InvertedListScanner1(this, store_pairs, sel); } else { diff --git a/faiss/IndexIVFFlat.h b/faiss/IndexIVFFlat.h index 919bca2b25..c298b7b7d2 100644 --- a/faiss/IndexIVFFlat.h +++ b/faiss/IndexIVFFlat.h @@ -44,7 +44,8 @@ struct IndexIVFFlat : IndexIVF { InvertedListScanner* get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const override; + const IDSelector* sel, + const IVFSearchParameters* params) const override; void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index 7b8ad7a545..f270efa688 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -1350,7 +1350,8 @@ InvertedListScanner* get_InvertedListScanner2( InvertedListScanner* IndexIVFPQ::get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const { + const IDSelector* sel, + const IVFSearchParameters*) const { if (sel) { return get_InvertedListScanner2(*this, store_pairs, sel); } else { @@ -1425,7 +1426,7 @@ void IndexIVFPQ::compute_distance_to_codes_for_list( float* dist_table) const { std::unique_ptr scanner( - get_InvertedListScanner(true, nullptr)); + get_InvertedListScanner(true, nullptr, nullptr)); if (dist_table) { @@ -1477,7 +1478,7 @@ void IndexIVFPQ::compute_distance_table( float* dist_table) const { std::unique_ptr scanner( - get_InvertedListScanner(true, nullptr)); + get_InvertedListScanner(true, nullptr, nullptr)); scanner->set_query(x); diff --git a/faiss/IndexIVFPQ.h b/faiss/IndexIVFPQ.h index b13c43b116..95d73a2a54 100644 --- a/faiss/IndexIVFPQ.h +++ 
b/faiss/IndexIVFPQ.h @@ -134,7 +134,8 @@ struct IndexIVFPQ : IndexIVF { InvertedListScanner* get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const override; + const IDSelector* sel, + const IVFSearchParameters* params) const override; /// build precomputed table void precompute_table(); diff --git a/faiss/IndexIVFRaBitQ.cpp b/faiss/IndexIVFRaBitQ.cpp new file mode 100644 index 0000000000..f4e61baf34 --- /dev/null +++ b/faiss/IndexIVFRaBitQ.cpp @@ -0,0 +1,277 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +namespace faiss { + +IndexIVFRaBitQ::IndexIVFRaBitQ( + Index* quantizer, + const size_t d, + const size_t nlist, + MetricType metric) + : IndexIVF(quantizer, d, nlist, 0, metric), rabitq(d, metric) { + code_size = rabitq.code_size; + invlists->code_size = code_size; + is_trained = false; + + by_residual = true; +} + +IndexIVFRaBitQ::IndexIVFRaBitQ() { + by_residual = true; +} + +void IndexIVFRaBitQ::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { + rabitq.train(n, x); +} + +void IndexIVFRaBitQ::encode_vectors( + idx_t n, + const float* x, + const idx_t* list_nos, + uint8_t* codes, + bool include_listnos) const { + size_t coarse_size = include_listnos ? 
coarse_code_size() : 0; + memset(codes, 0, (code_size + coarse_size) * n); + +#pragma omp parallel if (n > 1000) + { + std::vector centroid(d); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + int64_t list_no = list_nos[i]; + if (list_no >= 0) { + const float* xi = x + i * d; + uint8_t* code = codes + i * (code_size + coarse_size); + + // both by_residual and !by_residual lead to the same code + quantizer->reconstruct(list_no, centroid.data()); + rabitq.compute_codes_core( + xi, code + coarse_size, 1, centroid.data()); + + if (coarse_size) { + encode_listno(list_no, code); + } + } + } + } +} + +void IndexIVFRaBitQ::add_core( + idx_t n, + const float* x, + const idx_t* xids, + const idx_t* precomputed_idx, + void* inverted_list_context) { + FAISS_THROW_IF_NOT(is_trained); + + DirectMapAdd dm_add(direct_map, n, xids); + +#pragma omp parallel + { + std::vector one_code(code_size); + std::vector centroid(d); + + int nt = omp_get_num_threads(); + int rank = omp_get_thread_num(); + + // each thread takes care of a subset of lists + for (size_t i = 0; i < n; i++) { + int64_t list_no = precomputed_idx[i]; + if (list_no >= 0 && list_no % nt == rank) { + int64_t id = xids ? 
xids[i] : ntotal + i; + + const float* xi = x + i * d; + + // both by_residual and !by_residual lead to the same code + quantizer->reconstruct(list_no, centroid.data()); + rabitq.compute_codes_core( + xi, one_code.data(), 1, centroid.data()); + + size_t ofs = invlists->add_entry( + list_no, id, one_code.data(), inverted_list_context); + + dm_add.add(i, list_no, ofs); + + } else if (rank == 0 && list_no == -1) { + dm_add.add(i, -1, 0); + } + } + } + + ntotal += n; +} + +struct RaBitInvertedListScanner : InvertedListScanner { + const IndexIVFRaBitQ& ivf_rabitq; + + std::vector reconstructed_centroid; + std::vector query_vector; + + std::unique_ptr dc; + + uint8_t qb = 0; + + RaBitInvertedListScanner( + const IndexIVFRaBitQ& ivf_rabitq_in, + bool store_pairs = false, + const IDSelector* sel = nullptr, + uint8_t qb_in = 0) + : InvertedListScanner(store_pairs, sel), + ivf_rabitq{ivf_rabitq_in}, + qb{qb_in} { + keep_max = is_similarity_metric(ivf_rabitq.metric_type); + code_size = ivf_rabitq.code_size; + } + + /// from now on we handle this query. + void set_query(const float* query_vector_in) override { + query_vector.assign(query_vector_in, query_vector_in + ivf_rabitq.d); + + internal_try_setup_dc(); + } + + /// following codes come from this inverted list + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + + reconstructed_centroid.resize(ivf_rabitq.d); + ivf_rabitq.quantizer->reconstruct( + list_no, reconstructed_centroid.data()); + + internal_try_setup_dc(); + } + + /// compute a single query-to-code distance + float distance_to_code(const uint8_t* code) const override { + return dc->distance_to_code(code); + } + + void internal_try_setup_dc() { + if (!query_vector.empty() && !reconstructed_centroid.empty()) { + // both query_vector and centroid are available! 
+ // set up DistanceComputer + dc.reset(ivf_rabitq.rabitq.get_distance_computer( + qb, reconstructed_centroid.data())); + + dc->set_query(query_vector.data()); + } + } +}; + +InvertedListScanner* IndexIVFRaBitQ::get_InvertedListScanner( + bool store_pairs, + const IDSelector* sel, + const IVFSearchParameters* search_params_in) const { + uint8_t used_qb = qb; + if (auto params = dynamic_cast( + search_params_in)) { + used_qb = params->qb; + } + + return new RaBitInvertedListScanner(*this, store_pairs, sel, used_qb); +} + +void IndexIVFRaBitQ::reconstruct_from_offset( + int64_t list_no, + int64_t offset, + float* recons) const { + const uint8_t* code = invlists->get_single_code(list_no, offset); + + std::vector centroid(d); + quantizer->reconstruct(list_no, centroid.data()); + + rabitq.decode_core(code, recons, 1, centroid.data()); +} + +void IndexIVFRaBitQ::sa_decode(idx_t n, const uint8_t* codes, float* x) const { + size_t coarse_size = coarse_code_size(); + +#pragma omp parallel + { + std::vector centroid(d); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + const uint8_t* code = codes + i * (code_size + coarse_size); + int64_t list_no = decode_listno(code); + float* xi = x + i * d; + + quantizer->reconstruct(list_no, centroid.data()); + rabitq.decode_core(code + coarse_size, xi, 1, centroid.data()); + } + } +} + +struct IVFRaBitDistanceComputer : DistanceComputer { + const float* q = nullptr; + const IndexIVFRaBitQ* parent = nullptr; + + void set_query(const float* x) override; + + float operator()(idx_t i) override; + + float symmetric_dis(idx_t i, idx_t j) override; +}; + +void IVFRaBitDistanceComputer::set_query(const float* x) { + q = x; +} + +float IVFRaBitDistanceComputer::operator()(idx_t i) { + // find the appropriate list + idx_t lo = parent->direct_map.get(i); + uint64_t list_no = lo_listno(lo); + uint64_t offset = lo_offset(lo); + + const uint8_t* code = parent->invlists->get_single_code(list_no, offset); + + // ok, we know the appropriate cluster 
that we need + std::vector centroid(parent->d); + parent->quantizer->reconstruct(list_no, centroid.data()); + + // compute the distance + float distance = 0; + + std::unique_ptr dc( + parent->rabitq.get_distance_computer(parent->qb, centroid.data())); + dc->set_query(q); + distance = dc->distance_to_code(code); + + // deallocate + parent->invlists->release_codes(list_no, code); + + // done + return distance; +} + +float IVFRaBitDistanceComputer::symmetric_dis(idx_t i, idx_t j) { + FAISS_THROW_MSG("Not implemented"); +} + +DistanceComputer* IndexIVFRaBitQ::get_distance_computer() const { + IVFRaBitDistanceComputer* dc = new IVFRaBitDistanceComputer; + dc->parent = this; + return dc; +} + +} // namespace faiss diff --git a/faiss/IndexIVFRaBitQ.h b/faiss/IndexIVFRaBitQ.h new file mode 100644 index 0000000000..ca42dfc39d --- /dev/null +++ b/faiss/IndexIVFRaBitQ.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +namespace faiss { + +struct IVFRaBitQSearchParameters : IVFSearchParameters { + uint8_t qb = 0; +}; + +// * by_residual is true, just by design +struct IndexIVFRaBitQ : IndexIVF { + RaBitQuantizer rabitq; + + // the default number of bits to quantize a query with. + // use '0' to disable quantization and use raw fp32 values. 
+ uint8_t qb = 0; + + IndexIVFRaBitQ( + Index* quantizer, + const size_t d, + const size_t nlist, + MetricType metric = METRIC_L2); + + IndexIVFRaBitQ(); + + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + void encode_vectors( + idx_t n, + const float* x, + const idx_t* list_nos, + uint8_t* codes, + bool include_listnos = false) const override; + + void add_core( + idx_t n, + const float* x, + const idx_t* xids, + const idx_t* precomputed_idx, + void* inverted_list_context = nullptr) override; + + InvertedListScanner* get_InvertedListScanner( + bool store_pairs, + const IDSelector* sel, + const IVFSearchParameters* params) const override; + + void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) + const override; + + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + + // unfortunately + DistanceComputer* get_distance_computer() const override; +}; + +} // namespace faiss diff --git a/faiss/IndexIVFSpectralHash.cpp b/faiss/IndexIVFSpectralHash.cpp index c0a6c0e914..a4f23256a5 100644 --- a/faiss/IndexIVFSpectralHash.cpp +++ b/faiss/IndexIVFSpectralHash.cpp @@ -301,7 +301,8 @@ struct BuildScanner { InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const { + const IDSelector* sel, + const IVFSearchParameters*) const { FAISS_THROW_IF_NOT(!sel); BuildScanner bs; return dispatch_HammingComputer(code_size, bs, this, store_pairs); diff --git a/faiss/IndexIVFSpectralHash.h b/faiss/IndexIVFSpectralHash.h index 9f11105c11..77541bc6fa 100644 --- a/faiss/IndexIVFSpectralHash.h +++ b/faiss/IndexIVFSpectralHash.h @@ -71,7 +71,8 @@ struct IndexIVFSpectralHash : IndexIVF { InvertedListScanner* get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const override; + const IDSelector* sel, + const IVFSearchParameters* params) const override; /** replace the vector transform for an empty (and possibly untrained) index */ diff --git 
a/faiss/IndexRaBitQ.cpp b/faiss/IndexRaBitQ.cpp new file mode 100644 index 0000000000..c4025c4ce3 --- /dev/null +++ b/faiss/IndexRaBitQ.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace faiss { + +IndexRaBitQ::IndexRaBitQ() = default; + +IndexRaBitQ::IndexRaBitQ(idx_t d, MetricType metric) + : IndexFlatCodes(0, d, metric), rabitq(d, metric) { + code_size = rabitq.code_size; + + is_trained = false; +} + +void IndexRaBitQ::train(idx_t n, const float* x) { + // compute a centroid + std::vector centroid(d, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < d; j++) { + centroid[j] += x[i * d + j]; + } + } + + if (n != 0) { + for (size_t j = 0; j < d; j++) { + centroid[j] /= (float)n; + } + } + + center = std::move(centroid); + + // + rabitq.train(n, x); + is_trained = true; +} + +void IndexRaBitQ::sa_encode(idx_t n, const float* x, uint8_t* bytes) const { + FAISS_THROW_IF_NOT(is_trained); + rabitq.compute_codes_core(x, bytes, n, center.data()); +} + +void IndexRaBitQ::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { + FAISS_THROW_IF_NOT(is_trained); + rabitq.decode_core(bytes, x, n, center.data()); +} + +FlatCodesDistanceComputer* IndexRaBitQ::get_FlatCodesDistanceComputer() const { + FlatCodesDistanceComputer* dc = + rabitq.get_distance_computer(qb, center.data()); + dc->code_size = rabitq.code_size; + dc->codes = codes.data(); + return dc; +} + +FlatCodesDistanceComputer* IndexRaBitQ::get_quantized_distance_computer( + const uint8_t qb) const { + FlatCodesDistanceComputer* dc = + rabitq.get_distance_computer(qb, center.data()); + dc->code_size = rabitq.code_size; + dc->codes = codes.data(); + return dc; +} + +namespace { + +struct Run_search_with_dc_res { + using T = void; + + uint8_t qb = 0; + + template + void f(BlockResultHandler& 
res, const IndexRaBitQ* index, const float* xq) { + size_t ntotal = index->ntotal; + using SingleResultHandler = + typename BlockResultHandler::SingleResultHandler; + const int d = index->d; + +#pragma omp parallel // if (res.nq > 100) + { + std::unique_ptr dc( + index->get_quantized_distance_computer(qb)); + SingleResultHandler resi(res); +#pragma omp for + for (int64_t q = 0; q < res.nq; q++) { + resi.begin(q); + dc->set_query(xq + d * q); + for (size_t i = 0; i < ntotal; i++) { + if (res.is_in_selection(i)) { + float dis = (*dc)(i); + resi.add_result(dis, i); + } + } + resi.end(); + } + } + } +}; + +} // namespace + +void IndexRaBitQ::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params_in) const { + uint8_t used_qb = qb; + if (auto params = dynamic_cast(params_in)) { + used_qb = params->qb; + } + + const IDSelector* sel = (params_in != nullptr) ? params_in->sel : nullptr; + Run_search_with_dc_res r; + r.qb = used_qb; + + dispatch_knn_ResultHandler( + n, distances, labels, k, metric_type, sel, r, this, x); +} + +void IndexRaBitQ::range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params_in) const { + uint8_t used_qb = qb; + if (auto params = dynamic_cast(params_in)) { + used_qb = params->qb; + } + + const IDSelector* sel = (params_in != nullptr) ? params_in->sel : nullptr; + Run_search_with_dc_res r; + r.qb = used_qb; + + dispatch_range_ResultHandler(result, radius, metric_type, sel, r, this, x); +} + +} // namespace faiss diff --git a/faiss/IndexRaBitQ.h b/faiss/IndexRaBitQ.h new file mode 100644 index 0000000000..8d2cb47219 --- /dev/null +++ b/faiss/IndexRaBitQ.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace faiss { + +struct RaBitQSearchParameters : SearchParameters { + uint8_t qb = 0; +}; + +struct IndexRaBitQ : IndexFlatCodes { + RaBitQuantizer rabitq; + + // center of all points + std::vector center; + + // the default number of bits to quantize a query with. + // use '0' to disable quantization and use raw fp32 values. + uint8_t qb = 0; + + IndexRaBitQ(); + + IndexRaBitQ(idx_t d, MetricType metric = METRIC_L2); + + void train(idx_t n, const float* x) override; + + void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + + // returns a quantized-to-qb bits DC if qb > 0 + // returns a default fp32-based DC if qb == 0 + FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; + + // returns a quantized-to-qb bits DC if qb_in > 0 + // returns a default fp32-based DC if qb_in == 0 + FlatCodesDistanceComputer* get_quantized_distance_computer( + const uint8_t qb_in) const; + + // Don't rely on sa_decode(), bcz it is good for IP, but not for L2. + // As a result, use get_FlatCodesDistanceComputer() for the search. 
+ void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + void range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params = nullptr) const override; +}; + +} // namespace faiss diff --git a/faiss/IndexScalarQuantizer.cpp b/faiss/IndexScalarQuantizer.cpp index 8c013d0287..d465ce30f5 100644 --- a/faiss/IndexScalarQuantizer.cpp +++ b/faiss/IndexScalarQuantizer.cpp @@ -258,7 +258,8 @@ void IndexIVFScalarQuantizer::add_core( InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const { + const IDSelector* sel, + const IVFSearchParameters*) const { return sq.select_InvertedListScanner( metric_type, quantizer, store_pairs, sel, by_residual); } diff --git a/faiss/IndexScalarQuantizer.h b/faiss/IndexScalarQuantizer.h index fe73536f6a..4617c1b0ce 100644 --- a/faiss/IndexScalarQuantizer.h +++ b/faiss/IndexScalarQuantizer.h @@ -96,7 +96,8 @@ struct IndexIVFScalarQuantizer : IndexIVF { InvertedListScanner* get_InvertedListScanner( bool store_pairs, - const IDSelector* sel) const override; + const IDSelector* sel, + const IVFSearchParameters* params) const override; void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; diff --git a/faiss/clone_index.cpp b/faiss/clone_index.cpp index 7174cd6ae0..5a1e5cfad2 100644 --- a/faiss/clone_index.cpp +++ b/faiss/clone_index.cpp @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -107,6 +109,11 @@ IndexIVF* Cloner::clone_IndexIVF(const IndexIVF* ivf) { return nullptr; } +IndexBinaryIVF* clone_IndexBinaryIVF(const IndexBinaryIVF* ivf) { + TRYCLONE(IndexBinaryIVF, ivf) + return nullptr; +} + IndexRefine* clone_IndexRefine(const IndexRefine* ir) { TRYCLONE(IndexRefineFlat, ir) TRYCLONE(IndexRefine, ir) { @@ -131,6 +138,11 @@ IndexHNSW* 
clone_IndexHNSW(const IndexHNSW* ihnsw) { } } +IndexBinaryHNSW* clone_IndexBinaryHNSW(const IndexBinaryHNSW* ihnsw) { + TRYCLONE(IndexBinaryHNSW, ihnsw) + return nullptr; +} + IndexNNDescent* clone_IndexNNDescent(const IndexNNDescent* innd) { TRYCLONE(IndexNNDescentFlat, innd) TRYCLONE(IndexNNDescent, innd) { @@ -323,9 +335,10 @@ Index* Cloner::clone_Index(const Index* index) { IndexNSG* res = clone_IndexNSG(insg); // copy the dynamic allocated graph - auto& new_graph = res->nsg.final_graph; - auto& old_graph = insg->nsg.final_graph; - new_graph = std::make_shared>(*old_graph); + if (auto& old_graph = insg->nsg.final_graph) { + auto& new_graph = res->nsg.final_graph; + new_graph = std::make_shared>(*old_graph); + } res->own_fields = true; res->storage = clone_Index(insg->storage); @@ -385,6 +398,28 @@ Quantizer* clone_Quantizer(const Quantizer* quant) { IndexBinary* clone_binary_index(const IndexBinary* index) { if (auto ii = dynamic_cast(index)) { return new IndexBinaryFlat(*ii); + } else if ( + const IndexBinaryIVF* ivf = + dynamic_cast(index)) { + IndexBinaryIVF* res = clone_IndexBinaryIVF(ivf); + if (ivf->invlists == nullptr) { + res->invlists = nullptr; + } else { + res->invlists = clone_InvertedLists(ivf->invlists); + res->own_invlists = true; + } + + res->own_fields = true; + res->quantizer = clone_binary_index(ivf->quantizer); + + return res; + } else if ( + const IndexBinaryHNSW* ihnsw = + dynamic_cast(index)) { + IndexBinaryHNSW* res = clone_IndexBinaryHNSW(ihnsw); + res->own_fields = true; + res->storage = clone_binary_index(ihnsw->storage); + return res; } else { FAISS_THROW_MSG("cannot clone this type of index"); } diff --git a/faiss/cppcontrib/factory_tools.cpp b/faiss/cppcontrib/factory_tools.cpp index d1f283b8ff..46ffada3e8 100644 --- a/faiss/cppcontrib/factory_tools.cpp +++ b/faiss/cppcontrib/factory_tools.cpp @@ -8,8 +8,22 @@ // -*- c++ -*- #include + #include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include + namespace faiss { namespace { @@ -122,6 +136,11 @@ std::string reverse_index_factory(const faiss::Index* index) { const faiss::IndexHNSW* hnsw_index = dynamic_cast(index)) { return "HNSW" + std::to_string(get_hnsw_M(hnsw_index)); + } else if ( + const faiss::IndexNSG* nsg_index = + dynamic_cast(index)) { + return "NSG" + std::to_string(nsg_index->nsg.R) + "," + + reverse_index_factory(nsg_index->storage); } else if ( const faiss::IndexRefine* refine_index = dynamic_cast(index)) { diff --git a/faiss/cppcontrib/factory_tools.h b/faiss/cppcontrib/factory_tools.h index f83a6db4ad..20b9237254 100644 --- a/faiss/cppcontrib/factory_tools.h +++ b/faiss/cppcontrib/factory_tools.h @@ -9,20 +9,13 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include namespace faiss { +struct Index; +struct IndexBinary; + std::string reverse_index_factory(const faiss::Index* index); std::string reverse_index_factory(const faiss::IndexBinary* index); diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 04d28907d1..0051f047f4 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -258,12 +258,12 @@ if(FAISS_ENABLE_CUVS) utils/CuvsUtils.cu) endif() -add_library(faiss_gpu STATIC ${FAISS_GPU_SRC}) -set_target_properties(faiss_gpu PROPERTIES +add_library(faiss_gpu_objs OBJECT ${FAISS_GPU_SRC}) +set_target_properties(faiss_gpu_objs PROPERTIES POSITION_INDEPENDENT_CODE ON WINDOWS_EXPORT_ALL_SYMBOLS ON ) -target_include_directories(faiss_gpu PUBLIC +target_include_directories(faiss_gpu_objs PUBLIC $) if(FAISS_ENABLE_CUVS) @@ -298,7 +298,7 @@ if(FAISS_ENABLE_CUVS) utils/CuvsUtils.cu TARGET_DIRECTORY faiss PROPERTIES COMPILE_OPTIONS "-fvisibility=hidden") - target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_CUVS=1) + target_compile_definitions(faiss_gpu_objs PUBLIC USE_NVIDIA_CUVS=1) endif() if (FAISS_ENABLE_ROCM) @@ -308,11 +308,13 @@ endif() 
# Export FAISS_GPU_HEADERS variable to parent scope. set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE) -target_link_libraries(faiss PRIVATE "$") -target_link_libraries(faiss_avx2 PRIVATE "$") -target_link_libraries(faiss_avx512 PRIVATE "$") -target_link_libraries(faiss_avx512_spr PRIVATE "$") -target_link_libraries(faiss_sve PRIVATE "$") +target_link_libraries(faiss PRIVATE faiss_gpu_objs) +target_link_libraries(faiss_avx2 PRIVATE faiss_gpu_objs) +target_link_libraries(faiss_avx512 PRIVATE faiss_gpu_objs) +target_link_libraries(faiss_avx512_spr PRIVATE faiss_gpu_objs) +target_link_libraries(faiss_sve PRIVATE faiss_gpu_objs) + +install(TARGETS faiss_gpu_objs EXPORT faiss-targets) foreach(header ${FAISS_GPU_HEADERS}) get_filename_component(dir ${header} DIRECTORY ) @@ -322,8 +324,8 @@ foreach(header ${FAISS_GPU_HEADERS}) endforeach() if (FAISS_ENABLE_ROCM) - target_link_libraries(faiss_gpu PRIVATE hip::host roc::hipblas) - target_compile_options(faiss_gpu PRIVATE) + target_link_libraries(faiss_gpu_objs PRIVATE hip::host roc::hipblas) + target_compile_options(faiss_gpu_objs PRIVATE) else() # Prepares a host linker script and enables host linker to support # very large device object files. 
@@ -338,12 +340,12 @@ else() } ]=] ) - target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") + target_link_options(faiss_gpu_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") find_package(CUDAToolkit REQUIRED) - target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$:cuvs::cuvs> $<$:OpenMP::OpenMP_CXX>) - target_compile_options(faiss_gpu PRIVATE + target_link_libraries(faiss_gpu_objs PRIVATE CUDA::cudart CUDA::cublas $<$:cuvs::cuvs> $<$:OpenMP::OpenMP_CXX>) + target_compile_options(faiss_gpu_objs PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$:-Xcompiler=${OpenMP_CXX_FLAGS}>>) diff --git a/faiss/gpu/GpuAutoTune.cpp b/faiss/gpu/GpuAutoTune.cpp index fed0132d79..7c1ccfc61d 100644 --- a/faiss/gpu/GpuAutoTune.cpp +++ b/faiss/gpu/GpuAutoTune.cpp @@ -28,7 +28,8 @@ using namespace ::faiss; * Parameters to auto-tune on GpuIndex'es **********************************************************/ -#define DC(classname) auto ix = dynamic_cast(index) +#define DC(classname) \ + [[maybe_unused]] auto ix = dynamic_cast(index) void GpuParameterSpace::initialize(const Index* index) { if (DC(IndexPreTransform)) { diff --git a/faiss/gpu/GpuIcmEncoder.cu b/faiss/gpu/GpuIcmEncoder.cu index 999dd998a0..84202aeb7a 100644 --- a/faiss/gpu/GpuIcmEncoder.cu +++ b/faiss/gpu/GpuIcmEncoder.cu @@ -96,7 +96,7 @@ void GpuIcmEncoder::encode( auto fn = [=](int idx, IcmEncoderImpl* encoder) { size_t i0 = idx * base_shard_size + std::min(size_t(idx), n % nshards); size_t ni = base_shard_size; - if (ni < n % nshards) { + if (idx < n % nshards) { ++ni; } if (ni <= 0) { // only if n < nshards diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu index fe0c82b8aa..42a6092ddd 100644 --- a/faiss/gpu/GpuIndexCagra.cu +++ b/faiss/gpu/GpuIndexCagra.cu @@ -103,6 +103,10 @@ void GpuIndexCagra::train(idx_t n, const float* x) { this->ntotal = n; } +void GpuIndexCagra::add(idx_t n, const float* x) { + train(n, x); +} + bool 
GpuIndexCagra::addImplRequiresIDs_() const { return false; }; diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h index d6fae29b58..a0cac805a7 100644 --- a/faiss/gpu/GpuIndexCagra.h +++ b/faiss/gpu/GpuIndexCagra.h @@ -173,8 +173,8 @@ struct GpuIndexCagraConfig : public GpuIndexConfig { /// Number of Iterations to run if building with NN_DESCENT size_t nn_descent_niter = 20; - IVFPQBuildCagraConfig* ivf_pq_params = nullptr; - IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr; + std::shared_ptr ivf_pq_params{nullptr}; + std::shared_ptr ivf_pq_search_params{nullptr}; float refine_rate = 2.0f; bool store_dataset = true; }; @@ -245,7 +245,17 @@ struct GpuIndexCagra : public GpuIndex { faiss::MetricType metric = faiss::METRIC_L2, GpuIndexCagraConfig config = GpuIndexCagraConfig()); - /// Trains CAGRA based on the given vector data + /// Trains CAGRA based on the given vector data and add them along with ids. + /// NB: The use of the add function here is to build the CAGRA graph on + /// the base dataset. Use this function when you want to add vectors with + /// ids. Ref: https://github.com/facebookresearch/faiss/issues/4107 + void add(idx_t n, const float* x) override; + + /// Trains CAGRA based on the given vector data. + /// NB: The use of the train function here is to build the CAGRA graph on + /// the base dataset and is currently the only function to add the full set + /// of vectors (without IDs) to the index. There is no external quantizer to + /// be trained here. 
void train(idx_t n, const float* x) override; /// Initialize ourselves from the given CPU index; will overwrite diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu index da0e5ac8f3..f6e314e9b6 100644 --- a/faiss/gpu/GpuIndexIVFPQ.cu +++ b/faiss/gpu/GpuIndexIVFPQ.cu @@ -566,17 +566,13 @@ void GpuIndexIVFPQ::verifyPQSettings_() const { "Bits per code must be 8 (passed %d)", bitsPerCode_); } - } - - // The number of bytes per encoded vector must be one we support - FAISS_THROW_IF_NOT_FMT( - ivfpqConfig_.interleavedLayout || - IVFPQ::isSupportedPQCodeLength(subQuantizers_), - "Number of bytes per encoded vector / sub-quantizers (%d) " - "is not supported", - subQuantizers_); - - if (!should_use_cuvs(config_)) { + // The number of bytes per encoded vector must be one we support + FAISS_THROW_IF_NOT_FMT( + ivfpqConfig_.interleavedLayout || + IVFPQ::isSupportedPQCodeLength(subQuantizers_), + "Number of bytes per encoded vector / sub-quantizers (%d) " + "is not supported", + subQuantizers_); // Sub-quantizers must evenly divide dimensions available FAISS_THROW_IF_NOT_FMT( this->d % subQuantizers_ == 0, diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 39ee38efa9..649b7cb5cf 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -411,7 +411,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { raftHandles_.emplace(std::make_pair(device, defaultStream)); #endif - cudaStream_t asyncCopyStream = 0; + cudaStream_t asyncCopyStream = nullptr; CUDA_VERIFY( cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu index 82e3007d59..f60e1e3ab5 100644 --- a/faiss/gpu/impl/CuvsCagra.cu +++ b/faiss/gpu/impl/CuvsCagra.cu @@ -69,6 +69,7 @@ CuvsCagra::CuvsCagra( index_params_.intermediate_graph_degree = intermediate_graph_degree; index_params_.graph_degree = graph_degree; + 
index_params_.attach_dataset_on_build = store_dataset; if (!ivf_pq_search_params_) { ivf_pq_search_params_ = @@ -243,6 +244,7 @@ void CuvsCagra::search( storage_, n_, dim_); cuvs_index->update_dataset(raft_handle, dataset); } + store_dataset_ = true; } auto queries_view = raft::make_device_matrix_view( diff --git a/faiss/gpu/impl/CuvsCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh index c466aceec4..8e458d8be2 100644 --- a/faiss/gpu/impl/CuvsCagra.cuh +++ b/faiss/gpu/impl/CuvsCagra.cuh @@ -118,8 +118,11 @@ class CuvsCagra { const int dim_; /// Controls the underlying cuVS index if it should store the dataset in - /// device memory - bool store_dataset_; + /// device memory. Default set to true for enabling search capabilities on + /// the index. + /// NB: This is also required to be set to true for deserializing + /// an IndexHNSWCagra object. + bool store_dataset_ = true; /// Metric type of the index faiss::MetricType metric_; diff --git a/faiss/gpu/impl/DistanceUtils.cuh b/faiss/gpu/impl/DistanceUtils.cuh index fd894ae3bd..4ea899c8ec 100644 --- a/faiss/gpu/impl/DistanceUtils.cuh +++ b/faiss/gpu/impl/DistanceUtils.cuh @@ -303,7 +303,7 @@ __global__ void incrementIndex( int k, idx_t increment) { for (idx_t i = blockIdx.y; i < indices.getSize(0); i += gridDim.y) { - for (int j = threadIdx.x; j < k; j += blockDim.x) { + for (auto j = threadIdx.x; j < k; j += blockDim.x) { indices[i][idx_t(blockIdx.x) * k + j] += blockIdx.x * increment; } } diff --git a/faiss/gpu/impl/GpuScalarQuantizer.cuh b/faiss/gpu/impl/GpuScalarQuantizer.cuh index c2d781419d..186ecac1c2 100644 --- a/faiss/gpu/impl/GpuScalarQuantizer.cuh +++ b/faiss/gpu/impl/GpuScalarQuantizer.cuh @@ -377,7 +377,7 @@ struct Codec { smemVmin = smem; smemVdiff = smem + dim; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { + for (auto i = threadIdx.x; i < dim; i += blockDim.x) { // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1) // This can be simplified to vmin' + vdiff' * v where: // vdiff' = vdiff / 
(2^bits - 1) @@ -587,7 +587,7 @@ struct Codec { smemVmin = smem; smemVdiff = smem + dim; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { + for (auto i = threadIdx.x; i < dim; i += blockDim.x) { // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1) // This can be simplified to vmin' + vdiff' * v where: // vdiff' = vdiff / (2^bits - 1) @@ -753,7 +753,7 @@ struct Codec { smemVmin = smem; smemVdiff = smem + dim; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { + for (auto i = threadIdx.x; i < dim; i += blockDim.x) { // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1) // This can be simplified to vmin' + vdiff' * v where: // vdiff' = vdiff / (2^bits - 1) diff --git a/faiss/gpu/impl/IVFAppend.cu b/faiss/gpu/impl/IVFAppend.cu index ba5cedf3c7..dd1c9073b5 100644 --- a/faiss/gpu/impl/IVFAppend.cu +++ b/faiss/gpu/impl/IVFAppend.cu @@ -368,9 +368,9 @@ __global__ void ivfInterleavedAppend( // The set of addresses for each of the lists void** listData) { // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs? - int laneId = threadIdx.x % kWarpSize; - int warpId = threadIdx.x / kWarpSize; - int warpsPerBlock = blockDim.x / kWarpSize; + auto laneId = threadIdx.x % kWarpSize; + auto warpId = threadIdx.x / kWarpSize; + auto warpsPerBlock = blockDim.x / kWarpSize; // Each block is dedicated to a separate list idx_t listId = uniqueLists[blockIdx.x]; diff --git a/faiss/gpu/impl/IVFFlatScan.cu b/faiss/gpu/impl/IVFFlatScan.cu index 457d0afeb6..5c6307b032 100644 --- a/faiss/gpu/impl/IVFFlatScan.cu +++ b/faiss/gpu/impl/IVFFlatScan.cu @@ -65,9 +65,9 @@ struct IVFFlatScan { int limit = utils::divDown(dim, Codec::kDimPerIter); // Each warp handles a separate chunk of vectors - int warpId = threadIdx.x / kWarpSize; + auto warpId = threadIdx.x / kWarpSize; // FIXME: why does getLaneId() not work when we write out below!?!?! 
- int laneId = threadIdx.x % kWarpSize; // getLaneId(); + auto laneId = threadIdx.x % kWarpSize; // getLaneId(); // Divide the set of vectors among the warps idx_t vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps); diff --git a/faiss/gpu/impl/IVFInterleaved.cu b/faiss/gpu/impl/IVFInterleaved.cu index e5b13f3aa8..fc99a49163 100644 --- a/faiss/gpu/impl/IVFInterleaved.cu +++ b/faiss/gpu/impl/IVFInterleaved.cu @@ -27,7 +27,7 @@ __global__ void ivfInterleavedScan2( Tensor distanceOut, Tensor indicesOut) { if constexpr ((NumWarpQ == 1 && NumThreadQ == 1) || NumWarpQ >= kWarpSize) { - int queryId = blockIdx.x; + auto queryId = blockIdx.x; constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; @@ -99,7 +99,7 @@ __global__ void ivfInterleavedScan2( // Merge all final results heap.reduce(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i += blockDim.x) { // Re-adjust the value we are selecting based on the sorting order distanceOut[queryId][i] = smemK[i] * adj; auto packedIndex = smemV[i]; diff --git a/faiss/gpu/impl/IVFInterleaved.cuh b/faiss/gpu/impl/IVFInterleaved.cuh index f1da8342d4..1b7fbbe7a1 100644 --- a/faiss/gpu/impl/IVFInterleaved.cuh +++ b/faiss/gpu/impl/IVFInterleaved.cuh @@ -56,7 +56,7 @@ __global__ void ivfInterleavedScan( for (idx_t queryId = blockIdx.y; queryId < queries.getSize(0); queryId += gridDim.y) { - int probeId = blockIdx.x; + auto probeId = blockIdx.x; idx_t listId = listIds[queryId][probeId]; // Safety guard in case NaNs in input cause no list ID to be @@ -69,8 +69,8 @@ __global__ void ivfInterleavedScan( int dim = queries.getSize(1); // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs? 
- int laneId = threadIdx.x % kWarpSize; - int warpId = threadIdx.x / kWarpSize; + auto laneId = threadIdx.x % kWarpSize; + auto warpId = threadIdx.x / kWarpSize; using EncodeT = typename Codec::EncodeT; @@ -215,7 +215,7 @@ __global__ void ivfInterleavedScan( auto distanceOutBase = distanceOut[queryId][probeId].data(); auto indicesOutBase = indicesOut[queryId][probeId].data(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i += blockDim.x) { distanceOutBase[i] = smemK[i]; indicesOutBase[i] = smemV[i]; } diff --git a/faiss/gpu/impl/IVFUtilsSelect1.cu b/faiss/gpu/impl/IVFUtilsSelect1.cu index 3cb88bd9c7..c4f65bab8f 100644 --- a/faiss/gpu/impl/IVFUtilsSelect1.cu +++ b/faiss/gpu/impl/IVFUtilsSelect1.cu @@ -90,7 +90,7 @@ __global__ void pass1SelectLists( // Write out the final k-selected values; they should be all // together - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i += blockDim.x) { heapDistances[queryId][sliceId][i] = smemK[i]; heapIndices[queryId][sliceId][i] = idx_t(smemV[i]); } diff --git a/faiss/gpu/impl/IVFUtilsSelect2.cu b/faiss/gpu/impl/IVFUtilsSelect2.cu index 3a94101bdb..2dbf3c0f00 100644 --- a/faiss/gpu/impl/IVFUtilsSelect2.cu +++ b/faiss/gpu/impl/IVFUtilsSelect2.cu @@ -100,7 +100,7 @@ __global__ void pass2SelectLists( // Merge all final results heap.reduce(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { + for (auto i = threadIdx.x; i < k; i += blockDim.x) { outDistances[queryId][i] = smemK[i]; // `v` is the index in `heapIndices` diff --git a/faiss/gpu/impl/IcmEncoder.cu b/faiss/gpu/impl/IcmEncoder.cu index 20ff36e2d2..b86e390f85 100644 --- a/faiss/gpu/impl/IcmEncoder.cu +++ b/faiss/gpu/impl/IcmEncoder.cu @@ -46,8 +46,8 @@ __global__ void runIcmEncodeStep( int m) { using KVPair = Pair; - int id = blockIdx.x; // each block takes care of one vector - int code = threadIdx.x; // each thread takes care of one possible code + auto id = blockIdx.x; // each block 
takes care of one vector + auto code = threadIdx.x; // each thread takes care of one possible code // compute the objective value by look-up tables KVPair obj(0.0f, code); @@ -94,8 +94,8 @@ __global__ void runEvaluation( int M, int K, int dims) { - int id = blockIdx.x; // each block takes care of one vector - int d = threadIdx.x; // each thread takes care of one dimension + auto id = blockIdx.x; // each block takes care of one vector + auto d = threadIdx.x; // each thread takes care of one dimension float acc = 0.0f; #pragma unroll @@ -136,7 +136,7 @@ __global__ void runCodesPerturbation( int K, int nperts) { // each thread takes care of one vector - int id = blockIdx.x * blockDim.x + threadIdx.x; + auto id = blockIdx.x * blockDim.x + threadIdx.x; if (id >= n) { return; @@ -173,7 +173,7 @@ __global__ void runCodesSelection( int n, int M) { // each thread takes care of one vector - int id = blockIdx.x * blockDim.x + threadIdx.x; + auto id = blockIdx.x * blockDim.x + threadIdx.x; if (id >= n || objs[id] >= bestObjs[id]) { return; @@ -195,8 +195,8 @@ __global__ void runCodesSelection( * @param K number of codewords in a codebook */ __global__ void runNormAddition(float* uterm, const float* norm, int K) { - int id = blockIdx.x; - int code = threadIdx.x; + auto id = blockIdx.x; + auto code = threadIdx.x; uterm[id * K + code] += norm[code]; } diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu index e76a0831ff..0e65015e44 100644 --- a/faiss/gpu/impl/L2Norm.cu +++ b/faiss/gpu/impl/L2Norm.cu @@ -40,7 +40,7 @@ __global__ void l2NormRowMajor( // these are fine to be int (just based on block dimensions) int numWarps = utils::divUp(blockDim.x, kWarpSize); int laneId = getLaneId(); - int warpId = threadIdx.x / kWarpSize; + auto warpId = threadIdx.x / kWarpSize; bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); idx_t rowStart = idx_t(blockIdx.x) * RowTileSize; diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index c549af3947..6b86695976 
100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -43,6 +43,7 @@ faiss_gpu_test(TestGpuIndexFlat.cpp) faiss_gpu_test(TestGpuIndexIVFFlat.cpp) faiss_gpu_test(TestGpuIndexBinaryFlat.cpp) faiss_gpu_test(TestGpuMemoryException.cpp) +faiss_gpu_test(TestGpuIcmEncoder.cpp) faiss_gpu_test(TestGpuIndexIVFPQ.cpp) faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp) faiss_gpu_test(TestGpuResidualQuantizer.cpp) diff --git a/faiss/gpu/test/TestGpuIcmEncoder.cpp b/faiss/gpu/test/TestGpuIcmEncoder.cpp new file mode 100644 index 0000000000..0c793d4f58 --- /dev/null +++ b/faiss/gpu/test/TestGpuIcmEncoder.cpp @@ -0,0 +1,112 @@ +#include +#include +#include +#include + +#include +#include +#include + +using faiss::LocalSearchQuantizer; +using faiss::gpu::GpuIcmEncoder; +using faiss::gpu::GpuResourcesProvider; +using faiss::gpu::StandardGpuResources; + +struct ShardingTestParams { + size_t n; + size_t nshards; +}; + +class GpuIcmEncoderShardingTest + : public ::testing::TestWithParam { + protected: + void SetUp() override { + params = GetParam(); + + lsq.M = 4; + lsq.K = 16; + lsq.d = 32; + + std::uniform_real_distribution dist(-1.0f, 1.0f); + lsq.codebooks.resize(lsq.M * lsq.K * lsq.d); + for (auto& v : lsq.codebooks) { + v = dist(gen); + } + + x.resize(params.n * lsq.d); + codes.resize(params.n * lsq.M); + + for (auto& v : x) { + v = dist(gen); + } + + std::uniform_int_distribution codeDist(0, lsq.K - 1); + for (auto& c : codes) { + c = codeDist(gen); + } + } + + LocalSearchQuantizer lsq; + std::vector x; + std::vector codes; + std::mt19937 gen; + ShardingTestParams params; + static constexpr size_t ils_iters = 4; +}; + +TEST_P(GpuIcmEncoderShardingTest, DataShardingCorrectness) { + std::vector resources(params.nshards); + std::vector provs; + std::vector devices; + + for (size_t i = 0; i < params.nshards; ++i) { + resources[i].noTempMemory(); + provs.push_back(&resources[i]); + devices.push_back(0); // use GPU 0 for testing all shards + } + + 
GpuIcmEncoder encoder(&lsq, provs, devices); + encoder.set_binary_term(); + + gen.seed(42); + EXPECT_NO_THROW( + encoder.encode(codes.data(), x.data(), gen, params.n, ils_iters)); + + for (auto c : codes) { + EXPECT_GE(c, 0); + EXPECT_LT(c, lsq.K); + } +} + +std::vector GetShardingTestCases() { + return { + {1, 8}, + + {5, 4}, + + {10, 2}, + {10, 3}, + {10, 5}, + {10, 8}, + + {20, 8}, + }; +} + +INSTANTIATE_TEST_SUITE_P( + MultiGpuShardingTests, + GpuIcmEncoderShardingTest, + ::testing::ValuesIn(GetShardingTestCases()), + [](const ::testing::TestParamInfo& info) { + return "n" + std::to_string(info.param.n) + "_shards" + + std::to_string(info.param.nshards); + }); + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/faiss/gpu/utils/DeviceUtils.cu b/faiss/gpu/utils/DeviceUtils.cu index 15036d39fb..123c8e05e8 100644 --- a/faiss/gpu/utils/DeviceUtils.cu +++ b/faiss/gpu/utils/DeviceUtils.cu @@ -30,7 +30,7 @@ void setCurrentDevice(int device) { int getNumDevices() { int numDev = -1; cudaError_t err = cudaGetDeviceCount(&numDev); - if (cudaErrorNoDevice == err) { + if (cudaErrorNoDevice == err || cudaErrorInsufficientDriver == err) { numDev = 0; } else { CUDA_VERIFY(err); diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index 10bd813435..2dad791cec 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -32,6 +31,7 @@ namespace faiss { **************************************************************/ int HNSW::nb_neighbors(int layer_no) const { + FAISS_THROW_IF_NOT(layer_no + 1 < cum_nneighbor_per_level.size()); return cum_nneighbor_per_level[layer_no + 1] - cum_nneighbor_per_level[layer_no]; } @@ -590,15 +590,22 @@ int search_from_candidates( HNSWStats& stats, int level, int nres_in, - const SearchParametersHNSW* params) { + const SearchParameters* params) { int 
nres = nres_in; int ndis = 0; // can be overridden by search params - bool do_dis_check = params ? params->check_relative_distance - : hnsw.check_relative_distance; - int efSearch = params ? params->efSearch : hnsw.efSearch; - const IDSelector* sel = params ? params->sel : nullptr; + bool do_dis_check = hnsw.check_relative_distance; + int efSearch = hnsw.efSearch; + const IDSelector* sel = nullptr; + if (params) { + if (const SearchParametersHNSW* hnsw_params = + dynamic_cast(params)) { + do_dis_check = hnsw_params->check_relative_distance; + efSearch = hnsw_params->efSearch; + } + sel = params->sel; + } C::T threshold = res.threshold; for (int i = 0; i < candidates.size(); i++) { @@ -920,15 +927,22 @@ HNSWStats HNSW::search( DistanceComputer& qdis, ResultHandler& res, VisitedTable& vt, - const SearchParametersHNSW* params) const { + const SearchParameters* params) const { HNSWStats stats; if (entry_point == -1) { return stats; } int k = extract_k_from_ResultHandler(res); - bool bounded_queue = - params ? params->bounded_queue : this->search_bounded_queue; + bool bounded_queue = this->search_bounded_queue; + int efSearch = this->efSearch; + if (params) { + if (const SearchParametersHNSW* hnsw_params = + dynamic_cast(params)) { + bounded_queue = hnsw_params->bounded_queue; + efSearch = hnsw_params->efSearch; + } + } // greedy search on upper levels storage_idx_t nearest = entry_point; @@ -940,7 +954,7 @@ HNSWStats HNSW::search( stats.combine(local_stats); } - int ef = std::max(params ? params->efSearch : efSearch, k); + int ef = std::max(efSearch, k); if (bounded_queue) { // this is the most common branch MinimaxHeap candidates(ef); @@ -980,9 +994,17 @@ void HNSW::search_level_0( int search_type, HNSWStats& search_stats, VisitedTable& vt, - const SearchParametersHNSW* params) const { + const SearchParameters* params) const { const HNSW& hnsw = *this; - auto efSearch = params ? 
params->efSearch : hnsw.efSearch; + + auto efSearch = hnsw.efSearch; + if (params) { + if (const SearchParametersHNSW* hnsw_params = + dynamic_cast(params)) { + efSearch = hnsw_params->efSearch; + } + } + int k = extract_k_from_ResultHandler(res); if (search_type == 1) { @@ -1062,7 +1084,7 @@ void HNSW::permute_entries(const idx_t* map) { // swap everyone std::swap(levels, new_levels); std::swap(offsets, new_offsets); - std::swap(neighbors, new_neighbors); + neighbors = std::move(new_neighbors); } /************************************************************** diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h index f80fefc2e7..c736588229 100644 --- a/faiss/impl/HNSW.h +++ b/faiss/impl/HNSW.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ struct HNSW { /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i /// for all levels. this is where all storage goes. - std::vector neighbors; + MaybeOwnedVector neighbors; /// entry point in the search structure (one of the points with maximum /// level @@ -201,7 +202,7 @@ struct HNSW { DistanceComputer& qdis, ResultHandler& res, VisitedTable& vt, - const SearchParametersHNSW* params = nullptr) const; + const SearchParameters* params = nullptr) const; /// search only in level 0 from a given vertex void search_level_0( @@ -213,7 +214,7 @@ struct HNSW { int search_type, HNSWStats& search_stats, VisitedTable& vt, - const SearchParametersHNSW* params = nullptr) const; + const SearchParameters* params = nullptr) const; void reset(); @@ -265,7 +266,7 @@ int search_from_candidates( HNSWStats& stats, int level, int nres_in = 0, - const SearchParametersHNSW* params = nullptr); + const SearchParameters* params = nullptr); HNSWStats greedy_update_nearest( const HNSW& hnsw, diff --git a/faiss/impl/NNDescent.cpp b/faiss/impl/NNDescent.cpp index 3d707be067..9701142ddd 100644 --- a/faiss/impl/NNDescent.cpp +++ b/faiss/impl/NNDescent.cpp @@ -400,7 +400,7 @@ void 
NNDescent::build(DistanceComputer& qdis, const int n, bool verbose) { init_graph(qdis); nndescent(qdis, verbose); - final_graph.resize(ntotal * K); + final_graph.resize(uint64_t(ntotal) * K); // Store the neighbor link structure into final_graph // Clear the old graph diff --git a/faiss/impl/RaBitQuantizer.cpp b/faiss/impl/RaBitQuantizer.cpp new file mode 100644 index 0000000000..8261a9a86c --- /dev/null +++ b/faiss/impl/RaBitQuantizer.cpp @@ -0,0 +1,519 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace faiss { + +struct FactorsData { + // ||or - c||^2 - ((metric==IP) ? ||or||^2 : 0) + float or_minus_c_l2sqr = 0; + float dp_multiplier = 0; +}; + +struct QueryFactorsData { + float c1 = 0; + float c2 = 0; + float c34 = 0; + + float qr_to_c_L2sqr = 0; + float qr_norm_L2sqr = 0; +}; + +static size_t get_code_size(const size_t d) { + return (d + 7) / 8 + sizeof(FactorsData); +} + +RaBitQuantizer::RaBitQuantizer(size_t d, MetricType metric) + : Quantizer(d, get_code_size(d)), metric_type{metric} {} + +void RaBitQuantizer::train(size_t n, const float* x) { + // does nothing +} + +void RaBitQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) + const { + compute_codes_core(x, codes, n, centroid); +} + +void RaBitQuantizer::compute_codes_core( + const float* x, + uint8_t* codes, + size_t n, + const float* centroid_in) const { + FAISS_ASSERT(codes != nullptr); + FAISS_ASSERT(x != nullptr); + FAISS_ASSERT( + (metric_type == MetricType::METRIC_L2 || + metric_type == MetricType::METRIC_INNER_PRODUCT)); + + if (n == 0) { + return; + } + + // compute some helper constants + const float inv_d_sqrt = (d == 0) ? 
1.0f : (1.0f / std::sqrt((float)d)); + + // compute codes +#pragma omp parallel for if (n > 1000) + for (int64_t i = 0; i < n; i++) { + // ||or - c||^2 + float norm_L2sqr = 0; + // ||or||^2, which is equal to ||P(or)||^2 and ||P^(-1)(or)||^2 + float or_L2sqr = 0; + // dot product + float dp_oO = 0; + + // the code + uint8_t* code = codes + i * code_size; + FactorsData* fac = reinterpret_cast(code + (d + 7) / 8); + + // cleanup it + if (code != nullptr) { + memset(code, 0, code_size); + } + + for (size_t j = 0; j < d; j++) { + const float or_minus_c = x[i * d + j] - + ((centroid_in == nullptr) ? 0 : centroid_in[j]); + norm_L2sqr += or_minus_c * or_minus_c; + or_L2sqr += x[i * d + j] * x[i * d + j]; + + const bool xb = (or_minus_c > 0); + + dp_oO += xb ? or_minus_c : (-or_minus_c); + + // store the output data + if (code != nullptr) { + if (xb) { + // enable a particular bit + code[j / 8] |= (1 << (j % 8)); + } + } + } + + // compute factors + + // compute the inverse norm + const float inv_norm_L2 = + (std::abs(norm_L2sqr) < std::numeric_limits::epsilon()) + ? 1.0f + : (1.0f / std::sqrt(norm_L2sqr)); + dp_oO *= inv_norm_L2; + dp_oO *= inv_d_sqrt; + + const float inv_dp_oO = + (std::abs(dp_oO) < std::numeric_limits::epsilon()) + ? 1.0f + : (1.0f / dp_oO); + + fac->or_minus_c_l2sqr = norm_L2sqr; + if (metric_type == MetricType::METRIC_INNER_PRODUCT) { + fac->or_minus_c_l2sqr -= or_L2sqr; + } + + fac->dp_multiplier = inv_dp_oO * std::sqrt(norm_L2sqr); + } +} + +void RaBitQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { + decode_core(codes, x, n, centroid); +} + +void RaBitQuantizer::decode_core( + const uint8_t* codes, + float* x, + size_t n, + const float* centroid_in) const { + FAISS_ASSERT(codes != nullptr); + FAISS_ASSERT(x != nullptr); + + const float inv_d_sqrt = (d == 0) ? 
1.0f : (1.0f / std::sqrt((float)d)); + +#pragma omp parallel for if (n > 1000) + for (int64_t i = 0; i < n; i++) { + const uint8_t* code = codes + i * code_size; + + // split the code into parts + const uint8_t* binary_data = code; + const FactorsData* fac = + reinterpret_cast(code + (d + 7) / 8); + + // + for (size_t j = 0; j < d; j++) { + // extract i-th bit + const uint8_t masker = (1 << (j % 8)); + const float bit = ((binary_data[j / 8] & masker) == masker) ? 1 : 0; + + // compute the output code + x[i * d + j] = (bit - 0.5f) * fac->dp_multiplier * 2 * inv_d_sqrt + + ((centroid_in == nullptr) ? 0 : centroid_in[j]); + } + } +} + +struct RaBitDistanceComputer : FlatCodesDistanceComputer { + // dimensionality + size_t d = 0; + // a centroid to use + const float* centroid = nullptr; + + // the metric + MetricType metric_type = MetricType::METRIC_L2; + + RaBitDistanceComputer(); + + float symmetric_dis(idx_t i, idx_t j) override; +}; + +RaBitDistanceComputer::RaBitDistanceComputer() = default; + +float RaBitDistanceComputer::symmetric_dis(idx_t i, idx_t j) { + FAISS_THROW_MSG("Not implemented"); +} + +struct RaBitDistanceComputerNotQ : RaBitDistanceComputer { + // the rotated query (qr - c) + std::vector rotated_q; + // some additional numbers for the query + QueryFactorsData query_fac; + + RaBitDistanceComputerNotQ(); + + float distance_to_code(const uint8_t* code) override; + + void set_query(const float* x) override; +}; + +RaBitDistanceComputerNotQ::RaBitDistanceComputerNotQ() = default; + +float RaBitDistanceComputerNotQ::distance_to_code(const uint8_t* code) { + FAISS_ASSERT(code != nullptr); + FAISS_ASSERT( + (metric_type == MetricType::METRIC_L2 || + metric_type == MetricType::METRIC_INNER_PRODUCT)); + + // split the code into parts + const uint8_t* binary_data = code; + const FactorsData* fac = + reinterpret_cast(code + (d + 7) / 8); + + // this is the baseline code + // + // compute using floats + float dot_qo = 0; + // It was a willful decision (after the 
discussion) to not to pre-cache + // the sum of all bits, just in order to reduce the overhead per vector. + uint64_t sum_q = 0; + for (size_t i = 0; i < d; i++) { + // extract i-th bit + const uint8_t masker = (1 << (i % 8)); + const bool b_bit = ((binary_data[i / 8] & masker) == masker); + + // accumulate dp + dot_qo += (b_bit) ? rotated_q[i] : 0; + // accumulate sum-of-bits + sum_q += (b_bit) ? 1 : 0; + } + + float final_dot = 0; + // dot-product itself + final_dot += query_fac.c1 * dot_qo; + // normalizer coefficients + final_dot += query_fac.c2 * sum_q; + // normalizer coefficients + final_dot -= query_fac.c34; + + // this is ||or - c||^2 - (IP ? ||or||^2 : 0) + const float or_c_l2sqr = fac->or_minus_c_l2sqr; + + // pre_dist = ||or - c||^2 + ||qr - c||^2 - + // 2 * ||or - c|| * ||qr - c|| * - (IP ? ||or||^2 : 0) + const float pre_dist = or_c_l2sqr + query_fac.qr_to_c_L2sqr - + 2 * fac->dp_multiplier * final_dot; + + if (metric_type == MetricType::METRIC_L2) { + // ||or - q||^ 2 + return pre_dist; + } else { + // metric == MetricType::METRIC_INNER_PRODUCT + + // this is ||q||^2 + const float query_norm_sqr = query_fac.qr_norm_L2sqr; + + // 2 * (or, q) = (||or - q||^2 - ||q||^2 - ||or||^2) + return -0.5f * (pre_dist - query_norm_sqr); + } +} + +void RaBitDistanceComputerNotQ::set_query(const float* x) { + FAISS_ASSERT(x != nullptr); + FAISS_ASSERT( + (metric_type == MetricType::METRIC_L2 || + metric_type == MetricType::METRIC_INNER_PRODUCT)); + + // compute the distance from the query to the centroid + if (centroid != nullptr) { + query_fac.qr_to_c_L2sqr = fvec_L2sqr(x, centroid, d); + } else { + query_fac.qr_to_c_L2sqr = fvec_norm_L2sqr(x, d); + } + + // subtract c, obtain P^(-1)(qr - c) + rotated_q.resize(d); + for (size_t i = 0; i < d; i++) { + rotated_q[i] = x[i] - ((centroid == nullptr) ? 0 : centroid[i]); + } + + // compute some numbers + const float inv_d = (d == 0) ? 
1.0f : (1.0f / std::sqrt((float)d)); + + // do not quantize the query + float sum_q = 0; + for (size_t i = 0; i < d; i++) { + sum_q += rotated_q[i]; + } + + query_fac.c1 = 2 * inv_d; + query_fac.c2 = 0; + query_fac.c34 = sum_q * inv_d; + + if (metric_type == MetricType::METRIC_INNER_PRODUCT) { + // precompute if needed + query_fac.qr_norm_L2sqr = fvec_norm_L2sqr(x, d); + } +} + +// +struct RaBitDistanceComputerQ : RaBitDistanceComputer { + // the rotated and quantized query (qr - c) + std::vector rotated_qq; + // we're using the proposed relayout-ed scheme from 3.3 that allows + // using popcounts for computing the distance. + std::vector rearranged_rotated_qq; + // some additional numbers for the query + QueryFactorsData query_fac; + + // the number of bits for SQ quantization of the query (qb > 0) + uint8_t qb = 8; + // the smallest value divisible by 8 that is not smaller than dim + size_t popcount_aligned_dim = 0; + + RaBitDistanceComputerQ(); + + float distance_to_code(const uint8_t* code) override; + + void set_query(const float* x) override; +}; + +RaBitDistanceComputerQ::RaBitDistanceComputerQ() = default; + +float RaBitDistanceComputerQ::distance_to_code(const uint8_t* code) { + FAISS_ASSERT(code != nullptr); + FAISS_ASSERT( + (metric_type == MetricType::METRIC_L2 || + metric_type == MetricType::METRIC_INNER_PRODUCT)); + + // split the code into parts + const uint8_t* binary_data = code; + const FactorsData* fac = + reinterpret_cast(code + (d + 7) / 8); + + // // this is the baseline code + // // + // // compute using integers + // size_t dot_qo = 0; + // for (size_t i = 0; i < d; i++) { + // // extract i-th bit + // const uint8_t masker = (1 << (i % 8)); + // const uint8_t bit = ((binary_data[i / 8] & masker) == masker) ? 
1 : + // 0; + // + // // accumulate dp + // dot_qo += bit * rotated_qq[i]; + // } + + // this is the scheme for popcount + const size_t di_8b = (d + 7) / 8; + const size_t di_64b = (di_8b / 8) * 8; + + uint64_t dot_qo = 0; + for (size_t j = 0; j < qb; j++) { + const uint8_t* query_j = rearranged_rotated_qq.data() + j * di_8b; + + // process 64-bit popcounts + uint64_t count_dot = 0; + for (size_t i = 0; i < di_64b; i += 8) { + const auto qv = *(const uint64_t*)(query_j + i); + const auto yv = *(const uint64_t*)(binary_data + i); + count_dot += __builtin_popcountll(qv & yv); + } + + // process leftovers + for (size_t i = di_64b; i < di_8b; i++) { + const auto qv = *(query_j + i); + const auto yv = *(binary_data + i); + count_dot += __builtin_popcount(qv & yv); + } + + dot_qo += (count_dot << j); + } + + // It was a willful decision (after the discussion) to not to pre-cache + // the sum of all bits, just in order to reduce the overhead per vector. + uint64_t sum_q = 0; + { + // process 64-bit popcounts + for (size_t i = 0; i < di_64b; i += 8) { + const auto yv = *(const uint64_t*)(binary_data + i); + sum_q += __builtin_popcountll(yv); + } + + // process leftovers + for (size_t i = di_64b; i < di_8b; i++) { + const auto yv = *(binary_data + i); + sum_q += __builtin_popcount(yv); + } + } + + float final_dot = 0; + // dot-product itself + final_dot += query_fac.c1 * dot_qo; + // normalizer coefficients + final_dot += query_fac.c2 * sum_q; + // normalizer coefficients + final_dot -= query_fac.c34; + + // this is ||or - c||^2 - (IP ? ||or||^2 : 0) + const float or_c_l2sqr = fac->or_minus_c_l2sqr; + + // pre_dist = ||or - c||^2 + ||qr - c||^2 - + // 2 * ||or - c|| * ||qr - c|| * - (IP ? 
||or||^2 : 0) + const float pre_dist = or_c_l2sqr + query_fac.qr_to_c_L2sqr - + 2 * fac->dp_multiplier * final_dot; + + if (metric_type == MetricType::METRIC_L2) { + // ||or - q||^ 2 + return pre_dist; + } else { + // metric == MetricType::METRIC_INNER_PRODUCT + + // this is ||q||^2 + const float query_norm_sqr = query_fac.qr_norm_L2sqr; + + // 2 * (or, q) = (||or - q||^2 - ||q||^2 - ||or||^2) + return -0.5f * (pre_dist - query_norm_sqr); + } +} + +void RaBitDistanceComputerQ::set_query(const float* x) { + FAISS_ASSERT(x != nullptr); + FAISS_ASSERT( + (metric_type == MetricType::METRIC_L2 || + metric_type == MetricType::METRIC_INNER_PRODUCT)); + + // compute the distance from the query to the centroid + if (centroid != nullptr) { + query_fac.qr_to_c_L2sqr = fvec_L2sqr(x, centroid, d); + } else { + query_fac.qr_to_c_L2sqr = fvec_norm_L2sqr(x, d); + } + + // allocate space + rotated_qq.resize(d); + + // rotate the query + std::vector rotated_q(d); + for (size_t i = 0; i < d; i++) { + rotated_q[i] = x[i] - ((centroid == nullptr) ? 0 : centroid[i]); + } + + // compute some numbers + const float inv_d = (d == 0) ? 1.0f : (1.0f / std::sqrt((float)d)); + + // quantize the query. 
compute min and max + float v_min = std::numeric_limits::max(); + float v_max = std::numeric_limits::lowest(); + for (size_t i = 0; i < d; i++) { + const float v_q = rotated_q[i]; + v_min = std::min(v_min, v_q); + v_max = std::max(v_max, v_q); + } + + const float pow_2_qb = 1 << qb; + + const float delta = (v_max - v_min) / (pow_2_qb - 1); + const float inv_delta = 1.0f / delta; + + size_t sum_qq = 0; + for (int32_t i = 0; i < d; i++) { + const float v_q = rotated_q[i]; + + // a default non-randomized SQ + const int v_qq = std::round((v_q - v_min) * inv_delta); + + rotated_qq[i] = std::min(255, std::max(0, v_qq)); + sum_qq += v_qq; + } + + // rearrange the query vector + popcount_aligned_dim = ((d + 7) / 8) * 8; + size_t offset = (d + 7) / 8; + + rearranged_rotated_qq.resize(offset * qb); + std::fill(rearranged_rotated_qq.begin(), rearranged_rotated_qq.end(), 0); + + for (size_t idim = 0; idim < d; idim++) { + for (size_t iv = 0; iv < qb; iv++) { + const bool bit = ((rotated_qq[idim] & (1 << iv)) != 0); + rearranged_rotated_qq[iv * offset + idim / 8] |= + bit ? 
(1 << (idim % 8)) : 0; + } + } + + query_fac.c1 = 2 * delta * inv_d; + query_fac.c2 = 2 * v_min * inv_d; + query_fac.c34 = inv_d * (delta * sum_qq + d * v_min); + + if (metric_type == MetricType::METRIC_INNER_PRODUCT) { + // precompute if needed + query_fac.qr_norm_L2sqr = fvec_norm_L2sqr(x, d); + } +} + +FlatCodesDistanceComputer* RaBitQuantizer::get_distance_computer( + uint8_t qb, + const float* centroid_in) const { + if (qb == 0) { + auto dc = std::make_unique(); + dc->metric_type = metric_type; + dc->d = d; + dc->centroid = centroid_in; + + return dc.release(); + } else { + auto dc = std::make_unique(); + dc->metric_type = metric_type; + dc->d = d; + dc->centroid = centroid_in; + dc->qb = qb; + + return dc.release(); + } +} + +} // namespace faiss diff --git a/faiss/impl/RaBitQuantizer.h b/faiss/impl/RaBitQuantizer.h new file mode 100644 index 0000000000..01115838a4 --- /dev/null +++ b/faiss/impl/RaBitQuantizer.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace faiss { + +// the reference implementation of the https://arxiv.org/pdf/2405.12497 +// Jianyang Gao, Cheng Long, "RaBitQ: Quantizing High-Dimensional Vectors +// with a Theoretical Error Bound for Approximate Nearest Neighbor Search". +// +// It is assumed that the Random Matrix Rotation is performed externally. +struct RaBitQuantizer : Quantizer { + // all RaBitQ operations are provided against a centroid, which needs + // to be provided Externally (!). Nullptr value implies that the centroid + // consists of zero values. + // This is the default value that can be customized using XYZ_core() calls. + // Such a customization is needed for IVF calls. + // + // This particular pointer will NOT be serialized. 
+ float* centroid = nullptr; + + // RaBitQ codes computations are independent from a metric. But it is needed + // to store some additional fp32 constants together with a quantized code. + // A decision was made to make this quantizer as space efficient as + // possible. Thus, a quantizer has to introduce a metric. + MetricType metric_type = MetricType::METRIC_L2; + + RaBitQuantizer(size_t d = 0, MetricType metric = MetricType::METRIC_L2); + + void train(size_t n, const float* x) override; + + // every vector is expected to take (d + 7) / 8 + sizeof(FactorsData) bytes, + void compute_codes(const float* x, uint8_t* codes, size_t n) const override; + + void compute_codes_core( + const float* x, + uint8_t* codes, + size_t n, + const float* centroid_in) const; + + // The decode output is Heavily geared towards maintaining the IP, not L2. + // This means that the reconstructed codes maybe less accurate than one may + // expect, if one computes an L2 distance between a reconstructed code and + // the corresponding original vector. + // But value of the dot product between a query and the original vector + // might be very close to the value of the dot product between a query and + // the reconstructed code. + // Basically, it seems to be related to the distributions of values, not + // values. + void decode(const uint8_t* codes, float* x, size_t n) const override; + + void decode_core( + const uint8_t* codes, + float* x, + size_t n, + const float* centroid_in) const; + + // returns the distance computer. 
+ // specify qb = 0 to get an DC that does not quantize a query + // specify qb > 0 to have SQ qb-bits query + FlatCodesDistanceComputer* get_distance_computer( + uint8_t qb, + const float* centroid_in = nullptr) const; +}; + +} // namespace faiss diff --git a/faiss/impl/ResultHandler.h b/faiss/impl/ResultHandler.h index 104b34db3b..c5bc3ca76e 100644 --- a/faiss/impl/ResultHandler.h +++ b/faiss/impl/ResultHandler.h @@ -535,7 +535,7 @@ struct RangeSearchBlockResultHandler : BlockResultHandler { try { // finalize the partial result pres.finalize(); - } catch (const faiss::FaissException& e) { + } catch ([[maybe_unused]] const faiss::FaissException& e) { // Do nothing if allocation fails in finalizing partial results. #ifndef NDEBUG std::cerr << e.what() << std::endl; @@ -599,7 +599,7 @@ struct RangeSearchBlockResultHandler : BlockResultHandler { if (partial_results.size() > 0) { RangeSearchPartialResult::merge(partial_results); } - } catch (const faiss::FaissException& e) { + } catch ([[maybe_unused]] const faiss::FaissException& e) { // Do nothing if allocation fails in merge. 
#ifndef NDEBUG std::cerr << e.what() << std::endl; diff --git a/faiss/impl/code_distance/code_distance-sve.h b/faiss/impl/code_distance/code_distance-sve.h index 713b7d8099..82f7746be6 100644 --- a/faiss/impl/code_distance/code_distance-sve.h +++ b/faiss/impl/code_distance/code_distance-sve.h @@ -14,6 +14,7 @@ #include #include +#include #include namespace faiss { @@ -48,7 +49,7 @@ static inline void distance_codes_kernel( partialSum = svadd_f32_m(pg, partialSum, collected); } -static float distance_single_code_sve_for_small_m( +static inline float distance_single_code_sve_for_small_m( // the product quantizer const size_t M, // precomputed distances, layout (M, ksub) @@ -196,7 +197,7 @@ distance_four_codes_sve( result3); } -static void distance_four_codes_sve_for_small_m( +static inline void distance_four_codes_sve_for_small_m( // the product quantizer const size_t M, // precomputed distances, layout (M, ksub) @@ -217,8 +218,6 @@ static void distance_four_codes_sve_for_small_m( const auto offsets_0 = svindex_u32(0, static_cast(ksub)); - const auto quad_lanes = svcntw(); - // loop const auto pg = svwhilelt_b32_u64(0, M); diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 4c1bc27c28..44c0c1e731 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -53,8 +56,141 @@ #include #include +// mmap-ing and viewing facilities +#include + +#include +#include + namespace faiss { +/************************************************************* + * Mmap-ing and viewing facilities + **************************************************************/ + +// This is a baseline functionality for reading mmapped and zerocopied vector. +// * if `beforeknown_size` is defined, then a size of the vector won't be read. 
+// * if `size_multiplier` is defined, then a size will be multiplied by it. +// * returns true is the case was handled; ownerwise, false +template +bool read_vector_base( + VectorT& target, + IOReader* f, + const std::optional beforeknown_size, + const std::optional size_multiplier) { + // check if the use case is right + if constexpr (is_maybe_owned_vector_v) { + // is it a mmap-enabled reader? + MappedFileIOReader* mf = dynamic_cast(f); + if (mf != nullptr) { + // read the size or use a known one + size_t size = 0; + if (beforeknown_size.has_value()) { + size = beforeknown_size.value(); + } else { + READANDCHECK(&size, 1); + } + + // perform the size multiplication + size *= size_multiplier.value_or(1); + + // ok, mmap and check + char* address = nullptr; + const size_t nread = mf->mmap( + (void**)&address, + sizeof(typename VectorT::value_type), + size); + + FAISS_THROW_IF_NOT_FMT( + nread == (size), + "read error in %s: %zd != %zd (%s)", + f->name.c_str(), + nread, + size, + strerror(errno)); + + VectorT mmapped_view = + VectorT::create_view(address, nread, mf->mmap_owner); + target = std::move(mmapped_view); + + return true; + } + + // is it a zero-copy reader? 
+ ZeroCopyIOReader* zr = dynamic_cast(f); + if (zr != nullptr) { + // read the size or use a known one + size_t size = 0; + if (beforeknown_size.has_value()) { + size = beforeknown_size.value(); + } else { + READANDCHECK(&size, 1); + } + + // perform the size multiplication + size *= size_multiplier.value_or(1); + + // create a view + char* address = nullptr; + size_t nread = zr->get_data_view( + (void**)&address, + sizeof(typename VectorT::value_type), + size); + + FAISS_THROW_IF_NOT_FMT( + nread == (size), + "read error in %s: %zd != %zd (%s)", + f->name.c_str(), + nread, + size_t(size), + strerror(errno)); + + VectorT view = VectorT::create_view(address, nread, nullptr); + target = std::move(view); + + return true; + } + } + + return false; +} + +// a replacement for READANDCHECK for reading data into std::vector +template +void read_vector_with_known_size(VectorT& target, IOReader* f, size_t size) { + // size is known beforehand, no size multiplication + if (read_vector_base(target, f, size, std::nullopt)) { + return; + } + + // the default case + READANDCHECK(target.data(), size); +} + +// a replacement for READVECTOR +template +void read_vector(VectorT& target, IOReader* f) { + // size is not known beforehand, no size multiplication + if (read_vector_base(target, f, std::nullopt, std::nullopt)) { + return; + } + + // the default case + READVECTOR(target); +} + +// a replacement for READXBVECTOR +template +void read_xb_vector(VectorT& target, IOReader* f) { + // size is not known beforehand, nultiply the size 4x + if (read_vector_base(target, f, std::nullopt, 4)) { + return; + } + + // the default case + READXBVECTOR(target); +} + /************************************************************* * Read **************************************************************/ @@ -206,8 +342,9 @@ InvertedLists* read_InvertedLists(IOReader* f, int io_flags) { if (n > 0) { ails->ids[i].resize(n); ails->codes[i].resize(n * ails->code_size); - READANDCHECK(ails->codes[i].data(), 
n * ails->code_size); - READANDCHECK(ails->ids[i].data(), n); + read_vector_with_known_size( + ails->codes[i], f, n * ails->code_size); + read_vector_with_known_size(ails->ids[i], f, n); } } return ails; @@ -276,7 +413,7 @@ static void read_AdditiveQuantizer(AdditiveQuantizer* aq, IOReader* f) { aq->search_type == AdditiveQuantizer::ST_norm_cqint4 || aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || aq->search_type == AdditiveQuantizer::ST_norm_rq2x4) { - READXBVECTOR(aq->qnorm.codes); + read_xb_vector(aq->qnorm.codes, f); aq->qnorm.ntotal = aq->qnorm.codes.size() / 4; aq->qnorm.update_permutation(); } @@ -366,7 +503,7 @@ static void read_HNSW(HNSW* hnsw, IOReader* f) { READVECTOR(hnsw->cum_nneighbor_per_level); READVECTOR(hnsw->levels); READVECTOR(hnsw->offsets); - READVECTOR(hnsw->neighbors); + read_vector(hnsw->neighbors, f); READ1(hnsw->entry_point); READ1(hnsw->max_level); @@ -440,6 +577,13 @@ ProductQuantizer* read_ProductQuantizer(IOReader* reader) { return pq; } +static void read_RaBitQuantizer(RaBitQuantizer* rabitq, IOReader* f) { + // don't care about rabitq->centroid + READ1(rabitq->d); + READ1(rabitq->code_size); + READ1(rabitq->metric_type); +} + void read_direct_map(DirectMap* dm, IOReader* f) { char maintain_direct_map; READ1(maintain_direct_map); @@ -479,7 +623,12 @@ ArrayInvertedLists* set_array_invlist( std::vector>& ids) { ArrayInvertedLists* ail = new ArrayInvertedLists(ivf->nlist, ivf->code_size); - std::swap(ail->ids, ids); + + ail->ids.resize(ids.size()); + for (size_t i = 0; i < ids.size(); i++) { + ail->ids[i] = MaybeOwnedVector(std::move(ids[i])); + } + ivf->invlists = ail; ivf->own_invlists = true; return ail; @@ -547,7 +696,7 @@ Index* read_index(IOReader* f, int io_flags) { read_index_header(idxf, f); idxf->code_size = idxf->d * sizeof(float); - READXBVECTOR(idxf->codes); + read_xb_vector(idxf->codes, f); FAISS_THROW_IF_NOT( idxf->codes.size() == idxf->ntotal * idxf->code_size); // leak! 
@@ -578,7 +727,7 @@ Index* read_index(IOReader* f, int io_flags) { idxl->rrot = *rrot; delete rrot; } - READVECTOR(idxl->codes); + read_vector(idxl->codes, f); FAISS_THROW_IF_NOT( idxl->rrot.d_in == idxl->d && idxl->rrot.d_out == idxl->nbits); FAISS_THROW_IF_NOT( @@ -591,7 +740,7 @@ Index* read_index(IOReader* f, int io_flags) { read_index_header(idxp, f); read_ProductQuantizer(&idxp->pq, f); idxp->code_size = idxp->pq.code_size; - READVECTOR(idxp->codes); + read_vector(idxp->codes, f); if (h == fourcc("IxPo") || h == fourcc("IxPq")) { READ1(idxp->search_type); READ1(idxp->encode_signs); @@ -613,28 +762,28 @@ Index* read_index(IOReader* f, int io_flags) { read_ResidualQuantizer(&idxr->rq, f, io_flags); } READ1(idxr->code_size); - READVECTOR(idxr->codes); + read_vector(idxr->codes, f); idx = idxr; } else if (h == fourcc("IxLS")) { auto idxr = new IndexLocalSearchQuantizer(); read_index_header(idxr, f); read_LocalSearchQuantizer(&idxr->lsq, f); READ1(idxr->code_size); - READVECTOR(idxr->codes); + read_vector(idxr->codes, f); idx = idxr; } else if (h == fourcc("IxPR")) { auto idxpr = new IndexProductResidualQuantizer(); read_index_header(idxpr, f); read_ProductResidualQuantizer(&idxpr->prq, f, io_flags); READ1(idxpr->code_size); - READVECTOR(idxpr->codes); + read_vector(idxpr->codes, f); idx = idxpr; } else if (h == fourcc("IxPL")) { auto idxpl = new IndexProductLocalSearchQuantizer(); read_index_header(idxpl, f); read_ProductLocalSearchQuantizer(&idxpl->plsq, f); READ1(idxpl->code_size); - READVECTOR(idxpl->codes); + read_vector(idxpl->codes, f); idx = idxpl; } else if (h == fourcc("ImRQ")) { ResidualCoarseQuantizer* idxr = new ResidualCoarseQuantizer(); @@ -791,7 +940,7 @@ Index* read_index(IOReader* f, int io_flags) { IndexScalarQuantizer* idxs = new IndexScalarQuantizer(); read_index_header(idxs, f); read_ScalarQuantizer(&idxs->sq, f); - READVECTOR(idxs->codes); + read_vector(idxs->codes, f); idxs->code_size = idxs->sq.code_size; idx = idxs; } else if (h == 
fourcc("IxLa")) { @@ -949,7 +1098,7 @@ Index* read_index(IOReader* f, int io_flags) { READ1(idxp->code_size_1); READ1(idxp->code_size_2); READ1(idxp->code_size); - READVECTOR(idxp->codes); + read_vector(idxp->codes, f); idx = idxp; } else if ( h == fourcc("IHNf") || h == fourcc("IHNp") || h == fourcc("IHNs") || @@ -1062,6 +1211,24 @@ Index* read_index(IOReader* f, int io_flags) { imm->own_fields = true; idx = imm; + } else if (h == fourcc("Ixrq")) { + IndexRaBitQ* idxq = new IndexRaBitQ(); + read_index_header(idxq, f); + read_RaBitQuantizer(&idxq->rabitq, f); + READVECTOR(idxq->codes); + READVECTOR(idxq->center); + READ1(idxq->qb); + idxq->code_size = idxq->rabitq.code_size; + idx = idxq; + } else if (h == fourcc("Iwrq")) { + IndexIVFRaBitQ* ivrq = new IndexIVFRaBitQ(); + read_ivf_header(ivrq, f); + read_RaBitQuantizer(&ivrq->rabitq, f); + READ1(ivrq->code_size); + READ1(ivrq->by_residual); + READ1(ivrq->qb); + read_InvertedLists(ivrq, f, io_flags); + idx = ivrq; } else { FAISS_THROW_FMT( "Index type 0x%08x (\"%s\") not recognized", @@ -1073,14 +1240,28 @@ Index* read_index(IOReader* f, int io_flags) { } Index* read_index(FILE* f, int io_flags) { - FileIOReader reader(f); - return read_index(&reader, io_flags); + if ((io_flags & IO_FLAG_MMAP_IFC) == IO_FLAG_MMAP_IFC) { + // enable mmap-supporting IOReader + auto owner = std::make_shared(f); + MappedFileIOReader reader(owner); + return read_index(&reader, io_flags); + } else { + FileIOReader reader(f); + return read_index(&reader, io_flags); + } } Index* read_index(const char* fname, int io_flags) { - FileIOReader reader(fname); - Index* idx = read_index(&reader, io_flags); - return idx; + if ((io_flags & IO_FLAG_MMAP_IFC) == IO_FLAG_MMAP_IFC) { + // enable mmap-supporting IOReader + auto owner = std::make_shared(fname); + MappedFileIOReader reader(owner); + return read_index(&reader, io_flags); + } else { + FileIOReader reader(fname); + Index* idx = read_index(&reader, io_flags); + return idx; + } } 
VectorTransform* read_VectorTransform(const char* fname) { @@ -1183,7 +1364,7 @@ IndexBinary* read_index_binary(IOReader* f, int io_flags) { if (h == fourcc("IBxF")) { IndexBinaryFlat* idxf = new IndexBinaryFlat(); read_index_binary_header(idxf, f); - READVECTOR(idxf->xb); + read_vector(idxf->xb, f); FAISS_THROW_IF_NOT(idxf->xb.size() == idxf->ntotal * idxf->code_size); // leak! idx = idxf; @@ -1251,14 +1432,28 @@ IndexBinary* read_index_binary(IOReader* f, int io_flags) { } IndexBinary* read_index_binary(FILE* f, int io_flags) { - FileIOReader reader(f); - return read_index_binary(&reader, io_flags); + if ((io_flags & IO_FLAG_MMAP_IFC) == IO_FLAG_MMAP_IFC) { + // enable mmap-supporting IOReader + auto owner = std::make_shared(f); + MappedFileIOReader reader(owner); + return read_index_binary(&reader, io_flags); + } else { + FileIOReader reader(f); + return read_index_binary(&reader, io_flags); + } } IndexBinary* read_index_binary(const char* fname, int io_flags) { - FileIOReader reader(fname); - IndexBinary* idx = read_index_binary(&reader, io_flags); - return idx; + if ((io_flags & IO_FLAG_MMAP_IFC) == IO_FLAG_MMAP_IFC) { + // enable mmap-supporting IOReader + auto owner = std::make_shared(fname); + MappedFileIOReader reader(owner); + return read_index_binary(&reader, io_flags); + } else { + FileIOReader reader(fname); + IndexBinary* idx = read_index_binary(&reader, io_flags); + return idx; + } } } // namespace faiss diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 0118ef4711..5b65454fe3 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -364,6 +366,13 @@ static void write_NNDescent(const NNDescent* nnd, IOWriter* f) { WRITEVECTOR(nnd->final_graph); } +static void write_RaBitQuantizer(const RaBitQuantizer* rabitq, IOWriter* f) { + // don't care about 
rabitq->centroid + WRITE1(rabitq->d); + WRITE1(rabitq->code_size); + WRITE1(rabitq->metric_type); +} + static void write_direct_map(const DirectMap* dm, IOWriter* f) { char maintain_direct_map = (char)dm->type; // for backwards compatibility with bool @@ -850,6 +859,26 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { WRITE1(h); write_index_header(imm_2, f); write_index(imm_2->index, f); + } else if ( + const IndexRaBitQ* idxq = dynamic_cast(idx)) { + uint32_t h = fourcc("Ixrq"); + WRITE1(h); + write_index_header(idx, f); + write_RaBitQuantizer(&idxq->rabitq, f); + WRITEVECTOR(idxq->codes); + WRITEVECTOR(idxq->center); + WRITE1(idxq->qb); + } else if ( + const IndexIVFRaBitQ* ivrq = + dynamic_cast(idx)) { + uint32_t h = fourcc("Iwrq"); + WRITE1(h); + write_ivf_header(ivrq, f); + write_RaBitQuantizer(&ivrq->rabitq, f); + WRITE1(ivrq->code_size); + WRITE1(ivrq->by_residual); + WRITE1(ivrq->qb); + write_InvertedLists(ivrq->invlists, f); } else { FAISS_THROW_MSG("don't know how to serialize this type of index"); } diff --git a/faiss/impl/io.h b/faiss/impl/io.h index 9e28d64e9d..a2def099b5 100644 --- a/faiss/impl/io.h +++ b/faiss/impl/io.h @@ -16,12 +16,12 @@ #pragma once +#include +#include #include #include #include -#include - namespace faiss { struct IOReader { diff --git a/faiss/impl/io_macros.h b/faiss/impl/io_macros.h index c874ccf35c..5449ba1cc0 100644 --- a/faiss/impl/io_macros.h +++ b/faiss/impl/io_macros.h @@ -7,6 +7,8 @@ #pragma once +#include + /************************************************************* * I/O macros * diff --git a/faiss/impl/mapped_io.cpp b/faiss/impl/mapped_io.cpp new file mode 100644 index 0000000000..32486a9e6d --- /dev/null +++ b/faiss/impl/mapped_io.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#ifdef __linux__ + +#include +#include +#include +#include +#include + +#elif defined(_WIN32) + +#include // @manual +#include // @manual + +#endif + +#include + +#include +#include + +namespace faiss { + +#ifdef __linux__ + +struct MmappedFileMappingOwner::PImpl { + void* ptr = nullptr; + size_t ptr_size = 0; + + PImpl(const std::string& filename) { + auto f = std::unique_ptr( + fopen(filename.c_str(), "r"), &fclose); + FAISS_THROW_IF_NOT_FMT( + f.get(), + "could not open %s for reading: %s", + filename.c_str(), + strerror(errno)); + + // get the size + struct stat s; + int status = fstat(fileno(f.get()), &s); + FAISS_THROW_IF_NOT_FMT( + status >= 0, "fstat() failed: %s", strerror(errno)); + + const size_t filesize = s.st_size; + + void* address = mmap( + nullptr, filesize, PROT_READ, MAP_SHARED, fileno(f.get()), 0); + FAISS_THROW_IF_NOT_FMT( + address != nullptr, "could not mmap(): %s", strerror(errno)); + + // btw, fd can be closed here + + madvise(address, filesize, MADV_RANDOM); + + // save it + ptr = address; + ptr_size = filesize; + } + + PImpl(FILE* f) { + // get the size + struct stat s; + int status = fstat(fileno(f), &s); + FAISS_THROW_IF_NOT_FMT( + status >= 0, "fstat() failed: %s", strerror(errno)); + + const size_t filesize = s.st_size; + + void* address = + mmap(nullptr, filesize, PROT_READ, MAP_SHARED, fileno(f), 0); + FAISS_THROW_IF_NOT_FMT( + address != nullptr, "could not mmap(): %s", strerror(errno)); + + // btw, fd can be closed here + + madvise(address, filesize, MADV_RANDOM); + + // save it + ptr = address; + ptr_size = filesize; + } + + ~PImpl() { + // todo: check for an error + munmap(ptr, ptr_size); + } +}; + +#elif defined(_WIN32) + +struct MmappedFileMappingOwner::PImpl { + void* ptr = nullptr; + size_t ptr_size = 0; + HANDLE mapping_handle = INVALID_HANDLE_VALUE; + + PImpl(const std::string& filename) { + HANDLE file_handle = CreateFile( + filename.c_str(), + GENERIC_READ, + FILE_SHARE_READ, + nullptr, + 
OPEN_EXISTING, + 0, + nullptr); + if (file_handle == INVALID_HANDLE_VALUE) { + const auto error = GetLastError(); + FAISS_THROW_FMT( + "could not open the file, %s (error %d)", + filename.c_str(), + error); + } + + // get the size of the file + LARGE_INTEGER len_li; + if (GetFileSizeEx(file_handle, &len_li) == 0) { + const auto error = GetLastError(); + + CloseHandle(file_handle); + + FAISS_THROW_FMT( + "could not get the file size, %s (error %d)", + filename.c_str(), + error); + } + + // create a mapping + mapping_handle = CreateFileMapping( + file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr); + if (mapping_handle == 0) { + const auto error = GetLastError(); + + CloseHandle(file_handle); + + FAISS_THROW_FMT( + "could not create a file mapping, %s (error %d)", + filename.c_str(), + error); + } + CloseHandle(file_handle); + + char* data = + (char*)MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, 0); + if (data == nullptr) { + const auto error = GetLastError(); + + CloseHandle(mapping_handle); + mapping_handle = INVALID_HANDLE_VALUE; + + FAISS_THROW_FMT( + "could not get map the file, %s (error %d)", + filename.c_str(), + error); + } + + ptr = data; + ptr_size = len_li.QuadPart; + } + + PImpl(FILE* f) { + // obtain a HANDLE from a FILE + const int fd = _fileno(f); + if (fd == -1) { + // no good + FAISS_THROW_FMT("could not get a HANDLE"); + } + + HANDLE file_handle = (HANDLE)_get_osfhandle(fd); + if (file_handle == INVALID_HANDLE_VALUE) { + FAISS_THROW_FMT("could not get an OS HANDLE"); + } + + // get the size of the file + LARGE_INTEGER len_li; + if (GetFileSizeEx(file_handle, &len_li) == 0) { + const auto error = GetLastError(); + FAISS_THROW_FMT("could not get the file size (error %d)", error); + } + + // create a mapping + mapping_handle = CreateFileMapping( + file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr); + if (mapping_handle == 0) { + const auto error = GetLastError(); + FAISS_THROW_FMT( + "could not create a file mapping, (error %d)", error); + } + + 
// the handle is provided externally, so this is not our business + // to close file_handle. + + char* data = + (char*)MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, 0); + if (data == nullptr) { + const auto error = GetLastError(); + + CloseHandle(mapping_handle); + mapping_handle = INVALID_HANDLE_VALUE; + + FAISS_THROW_FMT("could not get map the file, (error %d)", error); + } + + ptr = data; + ptr_size = len_li.QuadPart; + } + + ~PImpl() { + if (mapping_handle != INVALID_HANDLE_VALUE) { + UnmapViewOfFile(ptr); + CloseHandle(mapping_handle); + + mapping_handle = INVALID_HANDLE_VALUE; + ptr = nullptr; + } + } +}; + +#else + +struct MmappedFileMappingOwner::PImpl { + void* ptr = nullptr; + size_t ptr_size = 0; + + PImpl(const std::string& filename) { + FAISS_THROW_MSG("Not implemented"); + } + + PImpl(FILE* f) { + FAISS_THROW_MSG("Not implemented"); + } +}; + +#endif + +MmappedFileMappingOwner::MmappedFileMappingOwner(const std::string& filename) { + p_impl = std::make_unique(filename); +} + +MmappedFileMappingOwner::MmappedFileMappingOwner(FILE* f) { + p_impl = std::make_unique(f); +} + +MmappedFileMappingOwner::~MmappedFileMappingOwner() = default; + +// +void* MmappedFileMappingOwner::data() const { + return p_impl->ptr; +} + +size_t MmappedFileMappingOwner::size() const { + return p_impl->ptr_size; +} + +MappedFileIOReader::MappedFileIOReader( + const std::shared_ptr& owner) + : mmap_owner(owner) {} + +// this operation performs a copy +size_t MappedFileIOReader::operator()(void* ptr, size_t size, size_t nitems) { + if (size * nitems == 0) { + return 0; + } + + char* ptr_c = nullptr; + + const size_t actual_nitems = this->mmap((void**)&ptr_c, size, nitems); + if (actual_nitems > 0) { + memcpy(ptr, ptr_c, size * actual_nitems); + } + + return actual_nitems; +} + +// this operation returns a mmapped address, owned by mmap_owner +size_t MappedFileIOReader::mmap(void** ptr, size_t size, size_t nitems) { + if (size == 0) { + return nitems; + } + + size_t 
actual_size = size * nitems; + if (pos + size * nitems > mmap_owner->size()) { + actual_size = mmap_owner->size() - pos; + } + + size_t actual_nitems = (actual_size + size - 1) / size; + if (actual_nitems == 0) { + return 0; + } + + // get an address + *ptr = (void*)(reinterpret_cast(mmap_owner->data()) + pos); + + // alter pos + pos += size * actual_nitems; + + return actual_nitems; +} + +int MappedFileIOReader::filedescriptor() { + // todo + return -1; +} + +} // namespace faiss diff --git a/faiss/impl/mapped_io.h b/faiss/impl/mapped_io.h new file mode 100644 index 0000000000..0e32df23d8 --- /dev/null +++ b/faiss/impl/mapped_io.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace faiss { + +// holds a memory-mapped region over a file +struct MmappedFileMappingOwner : public MaybeOwnedVectorOwner { + MmappedFileMappingOwner(const std::string& filename); + MmappedFileMappingOwner(FILE* f); + ~MmappedFileMappingOwner(); + + void* data() const; + size_t size() const; + + struct PImpl; + std::unique_ptr p_impl; +}; + +// A deserializer that supports memory-mapped files. +// All de-allocations should happen as soon as the index gets destroyed, +// after all underlying the MaybeOwnerVector objects are destroyed. 
+struct MappedFileIOReader : IOReader { + std::shared_ptr mmap_owner; + + size_t pos = 0; + + MappedFileIOReader(const std::shared_ptr& owner); + + // perform a copy + size_t operator()(void* ptr, size_t size, size_t nitems) override; + // perform a quasi-read that returns a mmapped address, owned by mmap_owner, + // and updates the position + size_t mmap(void** ptr, size_t size, size_t nitems); + + int filedescriptor() override; +}; + +} // namespace faiss diff --git a/faiss/impl/maybe_owned_vector.h b/faiss/impl/maybe_owned_vector.h new file mode 100644 index 0000000000..4b6770dac8 --- /dev/null +++ b/faiss/impl/maybe_owned_vector.h @@ -0,0 +1,316 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace faiss { + +// An interface for an owner of a MaybeOwnedVector. +struct MaybeOwnedVectorOwner { + virtual ~MaybeOwnedVectorOwner() = default; +}; + +// a container that either works as std::vector that owns its own memory, +// or as a view of a memory buffer, with a known size +template +struct MaybeOwnedVector { + using value_type = T; + using self_type = MaybeOwnedVector; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + using size_type = typename std::vector::size_type; + + bool is_owned = true; + + // this one is used if is_owned == true + std::vector owned_data; + + // these three are used if is_owned == false + T* view_data = nullptr; + // the number of T elements + size_t view_size = 0; + // who owns the data. + // This field can be nullptr, and it is present ONLY in order + // to avoid possible tricky memory / resource leaks. 
+ std::shared_ptr owner; + + // points either to view_data, or to owned.data() + T* c_ptr = nullptr; + // uses either view_size, or owned.size(); + size_t c_size = 0; + + MaybeOwnedVector() = default; + MaybeOwnedVector(const size_t initial_size) { + is_owned = true; + + owned_data.resize(initial_size); + c_ptr = owned_data.data(); + c_size = owned_data.size(); + } + + explicit MaybeOwnedVector(const std::vector& vec) + : faiss::MaybeOwnedVector(vec.size()) { + if (vec.size() > 0) { + memcpy(owned_data.data(), vec.data(), sizeof(T) * vec.size()); + } + } + + MaybeOwnedVector(const MaybeOwnedVector& other) { + is_owned = other.is_owned; + owned_data = other.owned_data; + + view_data = other.view_data; + view_size = other.view_size; + owner = other.owner; + + if (is_owned) { + c_ptr = owned_data.data(); + c_size = owned_data.size(); + } else { + c_ptr = view_data; + c_size = view_size; + } + } + + MaybeOwnedVector(MaybeOwnedVector&& other) { + is_owned = other.is_owned; + owned_data = std::move(other.owned_data); + + view_data = other.view_data; + view_size = other.view_size; + owner = std::move(other.owner); + other.owner = nullptr; + + if (is_owned) { + c_ptr = owned_data.data(); + c_size = owned_data.size(); + } else { + c_ptr = view_data; + c_size = view_size; + } + } + + MaybeOwnedVector& operator=(const MaybeOwnedVector& other) { + if (this == &other) { + return *this; + } + + // create a copy + MaybeOwnedVector cloned(other); + // swap + swap(*this, cloned); + + return *this; + } + + MaybeOwnedVector& operator=(MaybeOwnedVector&& other) { + if (this == &other) { + return *this; + } + + // moved + MaybeOwnedVector moved(std::move(other)); + // swap + swap(*this, moved); + + return *this; + } + + MaybeOwnedVector(std::vector&& other) { + is_owned = true; + + owned_data = std::move(other); + c_ptr = owned_data.data(); + c_size = owned_data.size(); + } + + static MaybeOwnedVector create_view( + void* address, + const size_t n_elements, + const std::shared_ptr& 
owner) { + MaybeOwnedVector vec; + vec.is_owned = false; + vec.view_data = reinterpret_cast(address); + vec.view_size = n_elements; + vec.owner = owner; + + vec.c_ptr = vec.view_data; + vec.c_size = vec.view_size; + + return vec; + } + + const T* data() const { + return c_ptr; + } + + T* data() { + return c_ptr; + } + + size_t size() const { + return c_size; + } + + size_t byte_size() const { + return c_size * sizeof(T); + } + + T& operator[](const size_t idx) { + return c_ptr[idx]; + } + + const T& operator[](const size_t idx) const { + return c_ptr[idx]; + } + + T& at(size_type pos) { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + return owned_data.at(pos); + } + + const T& at(size_type pos) const { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + return owned_data.at(pos); + } + + iterator begin() { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + return owned_data.begin(); + } + + const_iterator begin() const { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + return owned_data.begin(); + } + + iterator end() { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + return owned_data.end(); + } + + const_iterator end() const { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + return owned_data.end(); + } + + iterator erase(const_iterator begin, const_iterator end) { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + auto result = owned_data.erase(begin, end); + c_ptr = owned_data.data(); + c_size = owned_data.size(); + + return result; + } + + template + iterator insert(const_iterator pos, InputIt first, InputIt last) { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + auto result = 
owned_data.insert(pos, first, last); + c_ptr = owned_data.data(); + c_size = owned_data.size(); + + return result; + } + + void clear() { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + owned_data.clear(); + c_ptr = owned_data.data(); + c_size = owned_data.size(); + } + + void resize(const size_t new_size) { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + owned_data.resize(new_size); + c_ptr = owned_data.data(); + c_size = owned_data.size(); + } + + void resize(const size_t new_size, const value_type v) { + FAISS_ASSERT_MSG( + is_owned, + "This operation cannot be performed on a viewed vector"); + + owned_data.resize(new_size, v); + c_ptr = owned_data.data(); + c_size = owned_data.size(); + } + + friend void swap(self_type& a, self_type& b) { + std::swap(a.is_owned, b.is_owned); + std::swap(a.owned_data, b.owned_data); + std::swap(a.view_data, b.view_data); + std::swap(a.view_size, b.view_size); + std::swap(a.owner, b.owner); + std::swap(a.c_ptr, b.c_ptr); + std::swap(a.c_size, b.c_size); + } +}; + +template +struct is_maybe_owned_vector : std::false_type {}; + +template +struct is_maybe_owned_vector> : std::true_type {}; + +template +inline constexpr bool is_maybe_owned_vector_v = is_maybe_owned_vector::value; + +template +bool operator==( + const MaybeOwnedVector& lhs, + const MaybeOwnedVector& rhs) { + return lhs.size() == rhs.size() && + !memcmp(lhs.data(), rhs.data(), lhs.byte_size()); +} + +template +bool operator!=( + const MaybeOwnedVector& lhs, + const MaybeOwnedVector& rhs) { + return !(lhs == rhs); +} + +} // namespace faiss diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index 5fc632eb2d..4b03fbd00a 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -12,7 +12,7 @@ #include #include -#ifdef _MSC_VER +#ifdef _WIN32 /******************************************************* * Windows specific macros @@ -24,11 
+24,11 @@ #define FAISS_API __declspec(dllimport) #endif // FAISS_MAIN_LIB -#ifdef _MSC_VER #define strtok_r strtok_s -#endif // _MSC_VER +#ifdef _MSC_VER #define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif // _MSC_VER #define posix_memalign(p, a, s) \ (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) @@ -38,6 +38,7 @@ #define ALIGNED(x) __declspec(align(x)) // redefine the GCC intrinsics with Windows equivalents +#ifdef _MSC_VER #include #include @@ -76,6 +77,7 @@ inline int __builtin_clzll(uint64_t x) { #define __builtin_popcount __popcnt #define __builtin_popcountl __popcnt64 +#define __builtin_popcountll __popcnt64 #ifndef __clang__ #define __m128i_u __m128i @@ -102,6 +104,8 @@ inline int __builtin_clzll(uint64_t x) { #define __F16C__ 1 #endif +#endif // _MSC_VER + #define FAISS_ALWAYS_INLINE __forceinline #else diff --git a/faiss/impl/simd_result_handlers.h b/faiss/impl/simd_result_handlers.h index e12277a690..baa640d865 100644 --- a/faiss/impl/simd_result_handlers.h +++ b/faiss/impl/simd_result_handlers.h @@ -576,7 +576,7 @@ struct RangeHandler : ResultHandlerCompare { normalizers = norms; for (int q = 0; q < nq; ++q) { thresholds[q] = - normalizers[2 * q] * (radius - normalizers[2 * q + 1]); + int(normalizers[2 * q] * (radius - normalizers[2 * q + 1])); } } diff --git a/faiss/impl/zerocopy_io.cpp b/faiss/impl/zerocopy_io.cpp new file mode 100644 index 0000000000..2d37f6a8cc --- /dev/null +++ b/faiss/impl/zerocopy_io.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace faiss { + +ZeroCopyIOReader::ZeroCopyIOReader(uint8_t* data, size_t size) + : data_(data), rp_(0), total_(size) {} + +ZeroCopyIOReader::~ZeroCopyIOReader() {} + +size_t ZeroCopyIOReader::get_data_view(void** ptr, size_t size, size_t nitems) { + if (size == 0) { + return nitems; + } + + size_t actual_size = size * nitems; + if (rp_ + size * nitems > total_) { + actual_size = total_ - rp_; + } + + size_t actual_nitems = (actual_size + size - 1) / size; + if (actual_nitems == 0) { + return 0; + } + + // get an address + *ptr = (void*)(reinterpret_cast(data_ + rp_)); + + // alter pos + rp_ += size * actual_nitems; + + return actual_nitems; +} + +void ZeroCopyIOReader::reset() { + rp_ = 0; +} + +size_t ZeroCopyIOReader::operator()(void* ptr, size_t size, size_t nitems) { + if (size * nitems == 0) { + return 0; + } + + if (rp_ >= total_) { + return 0; + } + size_t nremain = (total_ - rp_) / size; + if (nremain < nitems) { + nitems = nremain; + } + memcpy(ptr, (data_ + rp_), size * nitems); + rp_ += size * nitems; + return nitems; +} + +int ZeroCopyIOReader::filedescriptor() { + return -1; // Indicating no file descriptor available for memory buffer +} + +} // namespace faiss diff --git a/faiss/impl/zerocopy_io.h b/faiss/impl/zerocopy_io.h new file mode 100644 index 0000000000..488b5d1e80 --- /dev/null +++ b/faiss/impl/zerocopy_io.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace faiss { + +// ZeroCopyIOReader just maps the data from a given pointer. 
+struct ZeroCopyIOReader : public faiss::IOReader { + uint8_t* data_; + size_t rp_ = 0; + size_t total_ = 0; + + ZeroCopyIOReader(uint8_t* data, size_t size); + ~ZeroCopyIOReader(); + + void reset(); + size_t get_data_view(void** ptr, size_t size, size_t nitems); + size_t operator()(void* ptr, size_t size, size_t nitems) override; + + int filedescriptor() override; +}; + +} // namespace faiss diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index 8ff4bfec7c..b4e0e9a48f 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -11,9 +11,6 @@ #include -#include -#include - #include #include @@ -33,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +66,7 @@ namespace { */ bool re_match(const std::string& s, const std::string& pat, std::smatch& sm) { + // @lint-ignore CLANGTIDY return std::regex_match(s, sm, std::regex(pat)); } @@ -164,7 +164,7 @@ const std::string aq_norm_pattern = const std::string paq_def_pattern = "([0-9]+)x([0-9]+)x([0-9]+)"; AdditiveQuantizer::Search_type_t aq_parse_search_type( - std::string stok, + const std::string& stok, MetricType metric) { if (stok == "") { return metric == METRIC_L2 ? 
AdditiveQuantizer::ST_decompress @@ -177,6 +177,7 @@ AdditiveQuantizer::Search_type_t aq_parse_search_type( std::vector aq_parse_nbits(std::string stok) { std::vector nbits; std::smatch sm; + // @lint-ignore CLANGTIDY while (std::regex_search(stok, sm, std::regex("[^q]([0-9]+)x([0-9]+)"))) { int M = std::stoi(sm[1].str()); int nbit = std::stoi(sm[2].str()); @@ -186,6 +187,8 @@ std::vector aq_parse_nbits(std::string stok) { return nbits; } +const std::string rabitq_pattern = "(RaBitQ)"; + /*************************************************************** * Parse VectorTransform */ @@ -436,6 +439,9 @@ IndexIVF* parse_IndexIVF( } return index_ivf; } + if (match(rabitq_pattern)) { + return new IndexIVFRaBitQ(get_q(), d, nlist, mt); + } return nullptr; } @@ -657,6 +663,11 @@ Index* parse_other_indexes( } } + // IndexRaBitQ + if (match(rabitq_pattern)) { + return new IndexRaBitQ(d, metric); + } + return nullptr; } @@ -766,7 +777,7 @@ std::unique_ptr index_factory_sub( } if (verbose) { - printf("after () normalization: %s %ld parenthesis indexes d=%d\n", + printf("after () normalization: %s %zd parenthesis indexes d=%d\n", description.c_str(), parenthesis_indexes.size(), d); diff --git a/faiss/index_io.h b/faiss/index_io.h index 191d3b9461..8a0654dc9e 100644 --- a/faiss/index_io.h +++ b/faiss/index_io.h @@ -66,6 +66,10 @@ const int IO_FLAG_PQ_SKIP_SDC_TABLE = 32; // try to memmap data (useful to load an ArrayInvertedLists as an // OnDiskInvertedLists) const int IO_FLAG_MMAP = IO_FLAG_SKIP_IVF_DATA | 0x646f0000; +// mmap that handles codes for IndexFlatCodes-derived indices and HNSW. +// this is a temporary solution, it is expected to be merged with IO_FLAG_MMAP +// after OnDiskInvertedLists get properly updated. 
+const int IO_FLAG_MMAP_IFC = 1 << 9; Index* read_index(const char* fname, int io_flags = 0); Index* read_index(FILE* f, int io_flags = 0); diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index ef14bcb973..899ae639cf 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -181,7 +181,7 @@ size_t InvertedLists::copy_subset_to( } double InvertedLists::imbalance_factor() const { - std::vector hist(nlist); + std::vector hist(nlist); for (size_t i = 0; i < nlist; i++) { hist[i] = list_size(i); @@ -330,8 +330,8 @@ void ArrayInvertedLists::update_entries( } void ArrayInvertedLists::permute_invlists(const idx_t* map) { - std::vector> new_codes(nlist); - std::vector> new_ids(nlist); + std::vector> new_codes(nlist); + std::vector> new_ids(nlist); for (size_t i = 0; i < nlist; i++) { size_t o = map[i]; diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index f8c72c0841..78799a23b2 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -15,9 +15,11 @@ * the interface. 
*/ -#include #include +#include +#include + namespace faiss { struct InvertedListsIterator { @@ -241,8 +243,8 @@ struct InvertedLists { /// simple (default) implementation as an array of inverted lists struct ArrayInvertedLists : InvertedLists { - std::vector> codes; // binary codes, size nlist - std::vector> ids; ///< Inverted lists for indexes + std::vector> codes; // binary codes, size nlist + std::vector> ids; ///< Inverted lists for indexes ArrayInvertedLists(size_t nlist, size_t code_size); diff --git a/faiss/invlists/InvertedListsIOHook.cpp b/faiss/invlists/InvertedListsIOHook.cpp index 0534a11907..13d8490a8d 100644 --- a/faiss/invlists/InvertedListsIOHook.cpp +++ b/faiss/invlists/InvertedListsIOHook.cpp @@ -13,9 +13,9 @@ #include -#ifndef _MSC_VER +#ifndef _WIN32 #include -#endif // !_MSC_VER +#endif // !_WIN32 namespace faiss { @@ -33,7 +33,7 @@ namespace { /// std::vector that deletes its contents struct IOHookTable : std::vector { IOHookTable() { -#ifndef _MSC_VER +#ifndef _WIN32 push_back(new OnDiskInvertedListsIOHook()); #endif push_back(new BlockInvertedListsIOHook()); diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index 9d956ebe71..7266da71f3 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -53,6 +53,7 @@ class_wrappers.handle_Linear(Linear) class_wrappers.handle_QINCo(QINCo) class_wrappers.handle_QINCoStep(QINCoStep) +shard_ivf_index_centroids = class_wrappers.handle_shard_ivf_index_centroids(shard_ivf_index_centroids) this_module = sys.modules[__name__] @@ -170,7 +171,7 @@ def replacement_function(*args): add_ref_in_constructor(GpuIndexIVFPQ, 1) add_ref_in_constructor(GpuIndexIVFScalarQuantizer, 1) except NameError as e: - logger.info("Failed to load GPU Faiss: %s. Will not load constructor refs for GPU indexes." % e.args[0]) + logger.info("Failed to load GPU Faiss: %s. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss." 
% e.args[0]) add_ref_in_constructor(IndexIVFFlat, 0) add_ref_in_constructor(IndexIVFFlatDedup, 0) diff --git a/faiss/python/array_conversions.py b/faiss/python/array_conversions.py index 0c57defe1e..b62c59e4ce 100644 --- a/faiss/python/array_conversions.py +++ b/faiss/python/array_conversions.py @@ -106,6 +106,13 @@ def vector_to_array(v): classname = v.__class__.__name__ if classname.startswith('AlignedTable'): return AlignedTable_to_array(v) + if classname.startswith('MaybeOwnedVector'): + dtype = np.dtype(vector_name_map[classname[16:]]) + a = np.empty(v.size(), dtype=dtype) + if v.size() > 0: + memcpy(swig_ptr(a), v.data(), a.nbytes) + return a + assert classname.endswith('Vector') dtype = np.dtype(vector_name_map[classname[:-6]]) a = np.empty(v.size(), dtype=dtype) @@ -122,6 +129,17 @@ def copy_array_to_vector(a, v): """ copy a numpy array to a vector """ n, = a.shape classname = v.__class__.__name__ + if classname.startswith('MaybeOwnedVector'): + assert v.is_owned, 'cannot copy to an non-owned MaybeOwnedVector' + dtype = np.dtype(vector_name_map[classname[16:]]) + assert dtype == a.dtype, ( + 'cannot copy a %s array to a %s (should be %s)' % ( + a.dtype, classname, dtype)) + v.resize(n) + if n > 0: + memcpy(v.data(), swig_ptr(a), a.nbytes) + return + assert classname.endswith('Vector') dtype = np.dtype(vector_name_map[classname[:-6]]) assert dtype == a.dtype, ( diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py index 607fdd6d29..2491aa8914 100644 --- a/faiss/python/class_wrappers.py +++ b/faiss/python/class_wrappers.py @@ -869,7 +869,7 @@ def replacement_reconstruct_n(self, n0=0, ni=-1, x=None): self.reconstruct_n_c(n0, ni, swig_ptr(x)) return x - def replacement_search(self, x, k): + def replacement_search(self, x, k, *, params=None): x = _check_dtype_uint8(x) n, d = x.shape assert d == self.code_size @@ -878,7 +878,8 @@ def replacement_search(self, x, k): labels = np.empty((n, k), dtype=np.int64) self.search_c(n, swig_ptr(x), k, 
swig_ptr(distances), - swig_ptr(labels)) + swig_ptr(labels), + params=params) return distances, labels def replacement_search_preassigned(self, x, k, Iq, Dq): @@ -906,12 +907,12 @@ def replacement_search_preassigned(self, x, k, Iq, Dq): ) return D, I - def replacement_range_search(self, x, thresh): + def replacement_range_search(self, x, thresh, *, params=None): n, d = x.shape x = _check_dtype_uint8(x) assert d == self.code_size res = RangeSearchResult(n) - self.range_search_c(n, swig_ptr(x), thresh, res) + self.range_search_c(n, swig_ptr(x), thresh, res, params=params) # get pointers and copy them lims = rev_swig_ptr(res.lims, n + 1).copy() nd = int(lims[-1]) @@ -1037,7 +1038,7 @@ def replacement_vt_train(self, x): def handle_AutoTuneCriterion(the_class): def replacement_set_groundtruth(self, D, I): - if D: + if D is not None: assert I.shape == D.shape self.nq, self.gt_nnn = I.shape self.set_groundtruth_c( @@ -1395,3 +1396,12 @@ def from_torch(self, qinco): the_class.__init__ = replacement_init the_class.from_torch = from_torch + + +def handle_shard_ivf_index_centroids(func): + def wrapper(*args, **kwargs): + args = list(args) + if len(args) > 3 and args[3] is not None: + args[3] = faiss.PyCallbackShardingFunction(args[3]) + return func(*args, **kwargs) + return wrapper diff --git a/faiss/python/loader.py b/faiss/python/loader.py index caef9e5512..c3b7b00c19 100644 --- a/faiss/python/loader.py +++ b/faiss/python/loader.py @@ -108,7 +108,7 @@ def is_sve_supported(): loaded = False has_AVX512 = any("AVX512" in x.upper() for x in instruction_sets) -if has_AVX512: +if has_AVX512 and not loaded: try: logger.info("Loading faiss with AVX512 support.") from .swigfaiss_avx512 import * diff --git a/faiss/python/python_callbacks.cpp b/faiss/python/python_callbacks.cpp index ce36bed437..8b78bf1e43 100644 --- a/faiss/python/python_callbacks.cpp +++ b/faiss/python/python_callbacks.cpp @@ -134,3 +134,27 @@ PyCallbackIDSelector::~PyCallbackIDSelector() { PyThreadLock gil; 
Py_DECREF(callback); } + +/*********************************************************** + * Callbacks for IVF index sharding + ***********************************************************/ + +PyCallbackShardingFunction::PyCallbackShardingFunction(PyObject* callback) + : callback(callback) { + PyThreadLock gil; + Py_INCREF(callback); +} + +int64_t PyCallbackShardingFunction::operator()(int64_t i, int64_t shard_count) { + PyThreadLock gil; + PyObject* shard_id = PyObject_CallFunction(callback, "LL", i, shard_count); + if (shard_id == nullptr) { + FAISS_THROW_MSG("propagate py error"); + } + return PyLong_AsLongLong(shard_id); +} + +PyCallbackShardingFunction::~PyCallbackShardingFunction() { + PyThreadLock gil; + Py_DECREF(callback); +} diff --git a/faiss/python/python_callbacks.h b/faiss/python/python_callbacks.h index fa8ebaf53c..072e69f91f 100644 --- a/faiss/python/python_callbacks.h +++ b/faiss/python/python_callbacks.h @@ -7,6 +7,7 @@ #pragma once +#include #include #include #include @@ -58,3 +59,24 @@ struct PyCallbackIDSelector : faiss::IDSelector { ~PyCallbackIDSelector() override; }; + +/*********************************************************** + * Callbacks for IVF index sharding + ***********************************************************/ + +struct PyCallbackShardingFunction : faiss::ivflib::ShardingFunction { + PyObject* callback; + + explicit PyCallbackShardingFunction(PyObject* callback); + + int64_t operator()(int64_t i, int64_t shard_count) override; + + ~PyCallbackShardingFunction() override; + + PyCallbackShardingFunction(const PyCallbackShardingFunction&) = delete; + PyCallbackShardingFunction(PyCallbackShardingFunction&&) noexcept = default; + PyCallbackShardingFunction& operator=(const PyCallbackShardingFunction&) = + default; + PyCallbackShardingFunction& operator=(PyCallbackShardingFunction&&) = + default; +}; diff --git a/faiss/python/setup.py b/faiss/python/setup.py index 23611cb370..b30cfa7813 100644 --- a/faiss/python/setup.py +++ 
b/faiss/python/setup.py @@ -105,7 +105,7 @@ """ setup( name="faiss", - version="1.10.0", + version="1.11.0", description="A library for efficient similarity search and clustering of dense vectors", long_description=long_description, url="https://github.com/facebookresearch/faiss", diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 493e42ef0e..67d903bd92 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -32,6 +32,7 @@ #pragma SWIG nowarn=341 #pragma SWIG nowarn=512 #pragma SWIG nowarn=362 +#pragma SWIG nowarn=509 // we need explict control of these typedefs... // %include @@ -81,6 +82,11 @@ typedef uint64_t size_t; #endif +#include + +#include +#include +#include #include #include @@ -184,6 +190,10 @@ typedef uint64_t size_t; #include +#include +#include +#include + %} /******************************************************** @@ -280,6 +290,9 @@ namespace std { %template(RepeatVector) std::vector; %template(ClusteringIterationStatsVector) std::vector; %template(ParameterRangeVector) std::vector; +%template(MaybeOwnedVectorUInt8Vector) std::vector >; +%template(MaybeOwnedVectorInt32Vector) std::vector >; +%template(MaybeOwnedVectorFloat32Vector) std::vector >; #ifndef SWIGWIN %template(OnDiskOneListVector) std::vector; @@ -506,6 +519,14 @@ void gpu_sync_all_devices() %include +%include + +%ignore faiss::MmappedFileMappingOwner::p_impl; + +%include +%include +%include + %newobject *::get_FlatCodesDistanceComputer() const; %include %include @@ -633,6 +654,9 @@ struct faiss::simd16uint16 {}; %include +%include +%include +%include %ignore faiss::BufferList::Buffer; %ignore faiss::RangeSearchPartialResult::QueryResult; @@ -744,6 +768,8 @@ struct faiss::simd16uint16 {}; DOWNCAST ( IndexShardsIVF ) DOWNCAST2 ( IndexShards, IndexShardsTemplateT_faiss__Index_t ) DOWNCAST2 ( IndexReplicas, IndexReplicasTemplateT_faiss__Index_t ) + DOWNCAST ( IndexRaBitQ ) + DOWNCAST ( IndexIVFRaBitQ ) DOWNCAST ( 
IndexIVFIndependentQuantizer) DOWNCAST ( IndexIVFPQR ) DOWNCAST ( IndexIVFPQ ) @@ -992,6 +1018,10 @@ faiss::Quantizer * downcast_Quantizer (faiss::Quantizer *aq) %template(AlignedTableUint16) faiss::AlignedTable; %template(AlignedTableFloat32) faiss::AlignedTable; +%template(MaybeOwnedVectorUInt8) faiss::MaybeOwnedVector; +%template(MaybeOwnedVectorInt32) faiss::MaybeOwnedVector; +%template(MaybeOwnedVectorFloat32) faiss::MaybeOwnedVector; + // SWIG seems to have some trouble resolving function template types here, so // declare explicitly diff --git a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h index 68d8e8c9f0..9f8d211956 100644 --- a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +++ b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h @@ -46,9 +46,11 @@ struct HeapWithBucketsForHamming32< // output distances int* const __restrict bh_val, // output indices, each being within [0, n) range - int64_t* const __restrict bh_ids) { + int64_t* const __restrict bh_ids, + // optional id selector for filtering + const IDSelector* sel = nullptr) { // forward a call to bs_addn with 1 beam - bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids); + bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids, sel); } static void bs_addn( @@ -66,7 +68,9 @@ struct HeapWithBucketsForHamming32< int* const __restrict bh_val, // output indices, each being within [0, n_per_beam * beam_size) // range - int64_t* const __restrict bh_ids) { + int64_t* const __restrict bh_ids, + // optional id selector for filtering + const IDSelector* sel = nullptr) { // using C = CMax; @@ -95,11 +99,22 @@ struct HeapWithBucketsForHamming32< for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { for (uint32_t j = 0; j < NBUCKETS_8; j++) { uint32_t hamming_distances[8]; + uint8_t valid_counter = 0; for (size_t j8 = 0; j8 < 8; j8++) { - hamming_distances[j8] = hc.hamming( - binary_vectors + - (j8 + j * 8 + ip + n_per_beam * beam_index) * - 
code_size); + const uint32_t idx = + j8 + j * 8 + ip + n_per_beam * beam_index; + if (!sel || sel->is_member(idx)) { + hamming_distances[j8] = hc.hamming( + binary_vectors + idx * code_size); + valid_counter++; + } else { + hamming_distances[j8] = + std::numeric_limits::max(); + } + } + + if (valid_counter == 8) { + continue; // Skip if all vectors are filtered out } // loop. Compiler should get rid of unneeded ops @@ -157,7 +172,8 @@ struct HeapWithBucketsForHamming32< const auto value = min_distances_scalar[j8]; const auto index = min_indices_scalar[j8]; - if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + if (value < std::numeric_limits::max() && + C::cmp2(bh_val[0], value, bh_ids[0], index)) { heap_replace_top( k, bh_val, bh_ids, value, index); } @@ -168,11 +184,13 @@ struct HeapWithBucketsForHamming32< // process leftovers for (uint32_t ip = nb; ip < n_per_beam; ip++) { const auto index = ip + n_per_beam * beam_index; - const auto value = - hc.hamming(binary_vectors + (index)*code_size); + if (!sel || sel->is_member(index)) { + const auto value = + hc.hamming(binary_vectors + (index)*code_size); - if (C::cmp(bh_val[0], value)) { - heap_replace_top(k, bh_val, bh_ids, value, index); + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } } } } diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 3136ef9f51..3743e82b69 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -62,15 +63,15 @@ void hammings( const uint64_t* __restrict bs2, size_t n1, size_t n2, - size_t nwords, + size_t nbits, hamdis_t* __restrict dis) { size_t i, j; - n1 *= nwords; - n2 *= nwords; - for (i = 0; i < n1; i += nwords) { - const uint64_t* bs1_ = bs1 + i; - for (j = 0; j < n2; j += nwords) - dis[j] = hamming(bs1_, bs2 + j, nwords); + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t* __restrict bs1_ = bs1 + i * nwords; 
+ hamdis_t* __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) + dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords); } } @@ -171,7 +172,8 @@ void hammings_knn_hc( size_t n2, bool order = true, bool init_heap = true, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK) { + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, + const faiss::IDSelector* sel = nullptr) { size_t k = ha->k; if (init_heap) ha->heapify(); @@ -204,7 +206,7 @@ void hammings_knn_hc( NB, \ BD, \ HammingComputer>:: \ - addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_); \ + addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ break; switch (approx_topk_mode) { @@ -214,6 +216,9 @@ void hammings_knn_hc( HANDLE_APPROX(32, 2) default: { for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { + if (sel && !sel->is_member(j)) { + continue; + } dis = hc.hamming(bs2_); if (dis < bh_val_[0]) { faiss::maxheap_replace_top( @@ -238,7 +243,8 @@ void hammings_knn_mc( size_t nb, size_t k, int32_t* __restrict distances, - int64_t* __restrict labels) { + int64_t* __restrict labels, + const faiss::IDSelector* sel) { const int nBuckets = bytes_per_code * 8 + 1; std::vector all_counters(na * nBuckets, 0); std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); @@ -259,7 +265,9 @@ void hammings_knn_mc( #pragma omp parallel for num_threads(num_omp_threads) for (int64_t i = 0; i < na; ++i) { for (size_t j = j0; j < j1; ++j) { - cs[i].update_counter(b + j * bytes_per_code, j); + if (!sel || sel->is_member(j)) { + cs[i].update_counter(b + j * bytes_per_code, j); + } } } } @@ -291,7 +299,8 @@ void hamming_range_search( size_t nb, int radius, size_t code_size, - RangeSearchResult* res) { + RangeSearchResult* res, + const faiss::IDSelector* sel) { #pragma omp parallel num_threads(num_omp_threads) { RangeSearchPartialResult pres(res); @@ -303,9 +312,11 @@ void hamming_range_search( RangeQueryResult& qres = pres.new_result(i); for (size_t j = 0; j < nb; j++) { - int dis = 
hc.hamming(yi); - if (dis < radius) { - qres.add(dis, j); + if (!sel || sel->is_member(j)) { + int dis = hc.hamming(yi); + if (dis < radius) { + qres.add(dis, j); + } } yi += code_size; } @@ -489,10 +500,21 @@ void hammings_knn_hc( size_t nb, size_t ncodes, int order, - ApproxTopK_mode_t approx_topk_mode) { + ApproxTopK_mode_t approx_topk_mode, + const faiss::IDSelector* sel) { Run_hammings_knn_hc r; dispatch_HammingComputer( - ncodes, r, ncodes, ha, a, b, nb, order, true, approx_topk_mode); + ncodes, + r, + ncodes, + ha, + a, + b, + nb, + order, + true, + approx_topk_mode, + sel); } void hammings_knn_mc( @@ -503,10 +525,11 @@ void hammings_knn_mc( size_t k, size_t ncodes, int32_t* __restrict distances, - int64_t* __restrict labels) { + int64_t* __restrict labels, + const faiss::IDSelector* sel) { Run_hammings_knn_mc r; dispatch_HammingComputer( - ncodes, r, ncodes, a, b, na, nb, k, distances, labels); + ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel); } void hamming_range_search( @@ -516,10 +539,11 @@ void hamming_range_search( size_t nb, int radius, size_t code_size, - RangeSearchResult* result) { + RangeSearchResult* result, + const faiss::IDSelector* sel) { Run_hamming_range_search r; dispatch_HammingComputer( - code_size, r, a, b, na, nb, radius, code_size, result); + code_size, r, a, b, na, nb, radius, code_size, result, sel); } /* Count number of matches given a max threshold */ diff --git a/faiss/utils/hamming.h b/faiss/utils/hamming.h index 85f9730e5c..3f3f488bc5 100644 --- a/faiss/utils/hamming.h +++ b/faiss/utils/hamming.h @@ -27,6 +27,7 @@ #include +#include #include #include @@ -135,7 +136,8 @@ void hammings_knn_hc( size_t nb, size_t ncodes, int ordered, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK); + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, + const faiss::IDSelector* sel = nullptr); /* Legacy alias to hammings_knn_hc. 
*/ void hammings_knn( @@ -166,7 +168,8 @@ void hammings_knn_mc( size_t k, size_t ncodes, int32_t* distances, - int64_t* labels); + int64_t* labels, + const faiss::IDSelector* sel = nullptr); /** same as hammings_knn except we are doing a range search with radius */ void hamming_range_search( @@ -176,7 +179,8 @@ void hamming_range_search( size_t nb, int radius, size_t ncodes, - RangeSearchResult* result); + RangeSearchResult* result, + const faiss::IDSelector* sel = nullptr); /* Counting the number of matches or of cross-matches (without returning them) For use with function that assume pre-allocated memory */ diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp index 4c9b7c92a1..653b240bc3 100644 --- a/faiss/utils/utils.cpp +++ b/faiss/utils/utils.cpp @@ -388,7 +388,7 @@ size_t ranklist_intersection_size( return count; } -double imbalance_factor(int k, const int* hist) { +double imbalance_factor(int k, const int64_t* hist) { double tot = 0, uf = 0; for (int i = 0; i < k; i++) { @@ -400,9 +400,9 @@ double imbalance_factor(int k, const int* hist) { return uf; } -double imbalance_factor(int n, int k, const int64_t* assign) { - std::vector hist(k, 0); - for (int i = 0; i < n; i++) { +double imbalance_factor(int64_t n, int k, const int64_t* assign) { + std::vector hist(k, 0); + for (int64_t i = 0; i < n; i++) { hist[assign[i]]++; } diff --git a/faiss/utils/utils.h b/faiss/utils/utils.h index 901459d1c7..7d75b3200d 100644 --- a/faiss/utils/utils.h +++ b/faiss/utils/utils.h @@ -92,10 +92,10 @@ size_t merge_result_table_with( /// a balanced assignment has a IF of 1, a completely unbalanced assignment has /// an IF = k. 
-double imbalance_factor(int n, int k, const int64_t* assign); +double imbalance_factor(int64_t n, int k, const int64_t* assign); /// same, takes a histogram as input -double imbalance_factor(int k, const int* hist); +double imbalance_factor(int k, const int64_t* hist); /// compute histogram on v int ivec_hist(size_t n, const int* v, int vmax, int* hist); diff --git a/perf_tests/bench_scalar_quantizer_distance.cpp b/perf_tests/bench_scalar_quantizer_distance.cpp index 14945c58c4..8a32d69c71 100644 --- a/perf_tests/bench_scalar_quantizer_distance.cpp +++ b/perf_tests/bench_scalar_quantizer_distance.cpp @@ -23,8 +23,8 @@ DEFINE_uint32(iterations, 20, "iterations"); static void bench_distance( benchmark::State& state, ScalarQuantizer::QuantizerType type, - int n, - int d) { + int d, + int n) { std::vector x(d * n); float_rand(x.data(), d * n, 12345); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dfab76e024..285b9090ed 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -36,6 +36,8 @@ set(FAISS_TEST_SRC test_callback.cpp test_utils.cpp test_hamming.cpp + test_mmap.cpp + test_zerocopy.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) diff --git a/tests/test_factory.py b/tests/test_factory.py index 4196895540..2246eb8c10 100644 --- a/tests/test_factory.py +++ b/tests/test_factory.py @@ -34,7 +34,7 @@ def test_factory_1(self): except RuntimeError: pass else: - assert False, "should do a runtime error" + raise AssertionError("should do a runtime error") def test_factory_2(self): @@ -62,6 +62,14 @@ def test_factory_5(self): assert index.sa_code_size() == 64 * 4 assert index.chain.at(0).d_out == 64 + def test_factory_6(self): + index = faiss.index_factory(128, "RaBitQ") + assert index.d == 128 + assert index.metric_type == faiss.METRIC_L2 + index = faiss.index_factory(128, "IVF256,RaBitQ") + assert index.d == 128 + assert index.metric_type == faiss.METRIC_L2 + def test_factory_HNSW(self): index = faiss.index_factory(12, "HNSW32") assert 
index.storage.sa_code_size() == 12 * 4 diff --git a/tests/test_factory_tools.cpp b/tests/test_factory_tools.cpp index 2e77645e80..f5dda2ad68 100644 --- a/tests/test_factory_tools.cpp +++ b/tests/test_factory_tools.cpp @@ -24,6 +24,8 @@ TEST(TestFactoryTools, TestReverseIndexFactory) { "HNSW32", "SQ8", "SQfp16", + "NSG24,Flat", + "NSG16,SQ8", }) { std::unique_ptr index{index_factory(64, factory)}; ASSERT_TRUE(index); @@ -32,6 +34,8 @@ TEST(TestFactoryTools, TestReverseIndexFactory) { using Case = std::pair; for (auto [src, dst] : { Case{"SQ8,RFlat", "SQ8,Refine(Flat)"}, + Case{"NSG", "NSG32,Flat"}, + Case{"NSG,PQ8", "NSG32,PQ8x8"}, }) { std::unique_ptr index{index_factory(64, src)}; ASSERT_TRUE(index); diff --git a/tests/test_fast_scan_ivf.py b/tests/test_fast_scan_ivf.py index 75c9500f82..a1d6a21440 100644 --- a/tests/test_fast_scan_ivf.py +++ b/tests/test_fast_scan_ivf.py @@ -270,8 +270,9 @@ def test_equiv_pq(self): index_pq = faiss.index_factory(32, "PQ16x4np") index_pq.pq = index.pq index_pq.is_trained = True - index_pq.codes = faiss. 
downcast_InvertedLists( + codevec = faiss.downcast_InvertedLists( index.invlists).codes.at(0) + index_pq.codes = faiss.MaybeOwnedVectorUInt8(codevec) index_pq.ntotal = index.ntotal Dnew, Inew = index_pq.search(xq, 4) diff --git a/tests/test_hamming.cpp b/tests/test_hamming.cpp index 423f9736d2..e4815ae93d 100644 --- a/tests/test_hamming.cpp +++ b/tests/test_hamming.cpp @@ -17,7 +17,7 @@ template std::string print_data( std::shared_ptr> data, const size_t divider) { - std::string ret = ""; + std::string ret; for (int i = 0; i < data->size(); ++i) { if (i % divider) { ret += " "; @@ -38,8 +38,11 @@ std::stringstream get_correct_hamming_example( std::shared_ptr> a, std::shared_ptr> b, std::shared_ptr> true_ids, - std::shared_ptr> true_distances) { - assert(nb > k); + // regular Hamming (bit-level distances) + std::shared_ptr> true_bit_distances, + // generalized Hamming (byte-level distances) + std::shared_ptr> true_byte_distances) { + assert(nb >= k); // Initialization std::default_random_engine rng(123); @@ -51,11 +54,12 @@ std::stringstream get_correct_hamming_example( a->resize(na * code_size, 1); // query vectors are all 1 b->clear(); b->resize(nb * code_size, 2); // database vectors are all 2 - true_ids->clear(); true_ids->reserve(nresults); - true_distances->clear(); - true_distances->reserve(nresults); + true_bit_distances->clear(); + true_bit_distances->reserve(nresults); + true_byte_distances->clear(); + true_byte_distances->reserve(nresults); // define correct ids (must be unique) std::set correct_ids; @@ -72,21 +76,32 @@ std::stringstream get_correct_hamming_example( // assemble true id and distance at locations true_ids->push_back(id); - true_distances->push_back(code_size - nmatches); // hamming dist + true_bit_distances->push_back( + (code_size > nmatches ? code_size - nmatches : 0) * + /* per-code distance between 1 and 2 (0b01 and 0b10) */ + 2); + true_byte_distances->push_back( + (code_size > nmatches ? 
code_size - nmatches : 0)); for (size_t i = 0; i < nmatches; ++i) { - b->begin()[id * code_size + i] = 1; + b->begin()[id * code_size + i] = 1; // query byte value } } - // true_ids and true_distances only contain results for the first query - // each query is identical, so copy the first query na-1 times + // true_ids, true_bit_distances, true_byte_distances only contain results + // for the first query. + // Query vectors are identical (all 1s), so copy the first sets of k + // distances na-1 times. for (size_t i = 1; i < na; ++i) { true_ids->insert( true_ids->end(), true_ids->begin(), true_ids->begin() + k); - true_distances->insert( - true_distances->end(), - true_distances->begin(), - true_distances->begin() + k); + true_bit_distances->insert( + true_bit_distances->end(), + true_bit_distances->begin(), + true_bit_distances->begin() + k); + true_byte_distances->insert( + true_byte_distances->end(), + true_byte_distances->begin(), + true_byte_distances->begin() + k); } // assemble string for debugging @@ -98,7 +113,10 @@ std::stringstream get_correct_hamming_example( << "a: " << print_data(a, code_size) << std::endl << "b: " << print_data(b, code_size) << std::endl << "true_ids: " << print_data(true_ids, k) << std::endl - << "true_distances: " << print_data(true_distances, k) << std::endl; + << "true_bit_distances: " << print_data(true_bit_distances, k) + << std::endl + << "true_byte_distances: " << print_data(true_byte_distances, k) + << std::endl; return ret; } @@ -261,14 +279,23 @@ TEST(TestHamming, test_hamming_knn) { auto a = std::make_shared>(); auto b = std::make_shared>(); auto true_ids = std::make_shared>(); - auto true_distances = std::make_shared>(); + auto true_bit_distances = std::make_shared>(); + auto true_byte_distances = std::make_shared>(); // 8, 16, 32 are cases - 24 will hit default case // all should be multiples of 8 for (auto code_size : {8, 16, 24, 32}) { // get example std::stringstream assert_str = get_correct_hamming_example( - na, nb, 
k, code_size, a, b, true_ids, true_distances); + na, + nb, + k, + code_size, + a, + b, + true_ids, + true_bit_distances, + true_byte_distances); // run test on generalized_hammings_knn_hc std::vector ids_gen(na * k); @@ -278,7 +305,7 @@ TEST(TestHamming, test_hamming_knn) { faiss::generalized_hammings_knn_hc( &res, a->data(), b->data(), nb, code_size, true); ASSERT_EQ(ids_gen, *true_ids) << assert_str.str(); - ASSERT_EQ(dist_gen, *true_distances) << assert_str.str(); + ASSERT_EQ(dist_gen, *true_byte_distances) << assert_str.str(); // run test on hammings_knn std::vector ids_ham_knn(na * k, 0); @@ -286,10 +313,23 @@ TEST(TestHamming, test_hamming_knn) { res = {na, k, ids_ham_knn.data(), dist_ham_knn.data()}; faiss::hammings_knn(&res, a->data(), b->data(), nb, code_size, true); ASSERT_EQ(ids_ham_knn, *true_ids) << assert_str.str(); - // hammings_knn results in twice the distance for some reason :/ - for (int i = 0; i < dist_ham_knn.size(); ++i) { - dist_ham_knn[i] /= 2; - } - ASSERT_EQ(dist_ham_knn, *true_distances) << assert_str.str(); + ASSERT_EQ(dist_ham_knn, *true_bit_distances) << assert_str.str(); + } + + for (auto code_size : {8, 16, 24, 32}) { + std::stringstream assert_str = get_correct_hamming_example( + na, + nb, + /* k */ nb, // faiss::hammings computes all distances + code_size, + a, + b, + true_ids, + true_bit_distances, + true_byte_distances); + std::vector dist_gen(na * nb); + faiss::hammings( + a->data(), b->data(), na, nb, code_size, dist_gen.data()); + EXPECT_EQ(dist_gen, *true_bit_distances) << assert_str.str(); } } diff --git a/tests/test_hnsw.cpp b/tests/test_hnsw.cpp index b3c93a861e..9c33c08a9e 100644 --- a/tests/test_hnsw.cpp +++ b/tests/test_hnsw.cpp @@ -193,6 +193,27 @@ TEST(HNSW, Test_popmin_infinite_distances) { } } +TEST(HNSW, Test_IndexHNSW_METRIC_Lp) { + // Create an HNSW index with METRIC_Lp and metric_arg = 3 + faiss::IndexFlat storage_index(1, faiss::METRIC_Lp); + storage_index.metric_arg = 3; + faiss::IndexHNSW 
index(&storage_index, 32); + + // Add a single data point + float data[1] = {0.0}; + index.add(1, data); + + // Prepare a query + float query[1] = {2.0}; + float distance; + faiss::idx_t label; + + index.search(1, query, 1, &distance, &label); + + EXPECT_NEAR(distance, 8.0, 1e-5); // Distance should be 8.0 (2^3) + EXPECT_EQ(label, 0); // Label should be 0 +} + class HNSWTest : public testing::Test { protected: HNSWTest() { @@ -582,6 +603,16 @@ TEST_F(HNSWTest, TEST_search_neighbors_to_add) { } } +TEST_F(HNSWTest, TEST_nb_neighbors_bound) { + omp_set_num_threads(1); + EXPECT_EQ(index->hnsw.nb_neighbors(0), 8); + EXPECT_EQ(index->hnsw.nb_neighbors(1), 4); + EXPECT_EQ(index->hnsw.nb_neighbors(2), 4); + EXPECT_EQ(index->hnsw.nb_neighbors(3), 4); + // picking a large number to trigger an exception based on checking bounds + EXPECT_THROW(index->hnsw.nb_neighbors(100), faiss::FaissException); +} + TEST_F(HNSWTest, TEST_search_level_0) { omp_set_num_threads(1); std::vector I(k * nq); diff --git a/tests/test_io.py b/tests/test_io.py index 3cbd0a6e10..e2c5e69a18 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -11,6 +11,7 @@ import io import sys import pickle +import platform from multiprocessing.pool import ThreadPool from common_faiss_tests import get_dataset_2 @@ -481,3 +482,53 @@ def test_reader(self): finally: if os.path.exists(fname): os.unlink(fname) + + + +class TestIOFlatMMap(unittest.TestCase): + @unittest.skipIf( + platform.system() not in ["Windows", "Linux"], + "supported OSes only" + ) + def test_mmap(self): + xt, xb, xq = get_dataset_2(32, 0, 100, 50) + index = faiss.index_factory(32, "SQfp16", faiss.METRIC_L2) + # does not need training + index.add(xb) + Dref, Iref = index.search(xq, 10) + + fd, fname = tempfile.mkstemp() + os.close(fd) + + index2 = None + try: + faiss.write_index(index, fname) + index2 = faiss.read_index(fname, faiss.IO_FLAG_MMAP_IFC) + Dnew, Inew = index2.search(xq, 10) + np.testing.assert_array_equal(Iref, Inew) + 
np.testing.assert_array_equal(Dref, Dnew) + finally: + del index2 + + if os.path.exists(fname): + # skip the error. On Windows, index2 holds the handle file, + # so it cannot be ensured that the file can be deleted + # unless index2 is collected by a GC + try: + os.unlink(fname) + except: + pass + + def test_zerocopy(self): + xt, xb, xq = get_dataset_2(32, 0, 100, 50) + index = faiss.index_factory(32, "SQfp16", faiss.METRIC_L2) + # does not need training + index.add(xb) + Dref, Iref = index.search(xq, 10) + + serialized_index = faiss.serialize_index(index) + reader = faiss.ZeroCopyIOReader(faiss.swig_ptr(serialized_index), serialized_index.size) + index2 = faiss.read_index(reader) + Dnew, Inew = index2.search(xq, 10) + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_array_equal(Dref, Dnew) diff --git a/tests/test_ivflib.py b/tests/test_ivflib.py index d905f3d486..4bcad0c0c5 100644 --- a/tests/test_ivflib.py +++ b/tests/test_ivflib.py @@ -8,6 +8,9 @@ import unittest import faiss import numpy as np +import os +import random + class TestIVFlib(unittest.TestCase): @@ -180,3 +183,191 @@ def test_small_data(self): assert np.all(lims == ref_lims) assert np.all(D == ref_D) assert np.all(I == ref_I) + + +class TestIvfSharding(unittest.TestCase): + d = 32 + nlist = 100 + nb = 1000 + + def custom_sharding_function(self, i, _): + return 1 if i % 2 == 0 else 7 + + # Mimics the default in DefaultShardingFunction. + # This impl is just used for verification. 
+ def default_sharding_function(self, i, shard_count): + return i % shard_count + + def verify_sharded_ivf_indexes( + self, template, xb, shard_count, sharding_function, generate_ids=True): + sharded_indexes_counters = [0] * shard_count + sharded_indexes = [] + for i in range(shard_count): + if xb[0].dtype.name == 'uint8': + index = faiss.read_index_binary(template % i) + else: + index = faiss.read_index(template % i) + sharded_indexes.append(index) + + # Reconstruct and verify each centroid + if generate_ids: + for i in range(len(xb)): + shard_id = sharding_function(i, shard_count) + reconstructed = sharded_indexes[shard_id].quantizer.reconstruct(i) + np.testing.assert_array_equal(reconstructed, xb[i]) + else: + for i in range(len(xb)): + shard_id = sharding_function(i, shard_count) + reconstructed = sharded_indexes[shard_id].quantizer.reconstruct( + sharded_indexes_counters[shard_id]) + sharded_indexes_counters[shard_id] += 1 + np.testing.assert_array_equal(reconstructed, xb[i]) + + # Clean up + for i in range(shard_count): + os.remove(template % i) + + def test_save_index_shards_by_centroids_no_op(self): + quantizer = faiss.IndexFlatL2(self.d) + index = faiss.IndexIVFFlat(quantizer, self.d, self.nlist) + with self.assertRaises(RuntimeError): + faiss.shard_ivf_index_centroids( + index, + 10, + "shard.%d.index", + None + ) + + def test_save_index_shards_by_centroids_flat_quantizer_default_sharding( + self): + xb = np.random.rand(self.nb, self.d).astype('float32') + quantizer = faiss.IndexFlatL2(self.d) + index = faiss.IndexIVFFlat(quantizer, self.d, self.nlist) + shard_count = 3 + + index.quantizer.add(xb) + + template = str(random.randint(0, 100000)) + "shard.%d.index" + faiss.shard_ivf_index_centroids( + index, + shard_count, + template, + None, + True + ) + self.verify_sharded_ivf_indexes( + template, xb, shard_count, self.default_sharding_function) + + def test_save_index_shards_by_centroids_flat_quantizer_custom_sharding( + self): + xb = 
np.random.rand(self.nb, self.d).astype('float32') + quantizer = faiss.IndexFlatL2(self.d) + index = faiss.IndexIVFFlat(quantizer, self.d, self.nlist) + shard_count = 20 + + index.quantizer.add(xb) + + template = str(random.randint(0, 100000)) + "shard.%d.index" + faiss.shard_ivf_index_centroids( + index, + shard_count, + template, + self.custom_sharding_function, + True + ) + self.verify_sharded_ivf_indexes( + template, xb, shard_count, self.custom_sharding_function) + + def test_save_index_shards_by_centroids_hnsw_quantizer(self): + xb = np.random.rand(self.nb, self.d).astype('float32') + quantizer = faiss.IndexHNSWFlat(self.d, 32) + index = faiss.IndexIVFFlat(quantizer, self.d, self.nlist) + shard_count = 17 + + index.quantizer.add(xb) + + template = str(random.randint(0, 100000)) + "shard.%d.index" + faiss.shard_ivf_index_centroids( + index, + shard_count, + template, + None, + True + ) + self.verify_sharded_ivf_indexes( + template, xb, shard_count, self.default_sharding_function) + + def test_save_index_shards_by_centroids_binary_flat_quantizer(self): + xb = np.random.randint(256, size=(self.nb, int(self.d / 8))).astype('uint8') + quantizer = faiss.IndexBinaryFlat(self.d) + index = faiss.IndexBinaryIVF(quantizer, self.d, self.nlist) + shard_count = 11 + + index.quantizer.add(xb) + + template = str(random.randint(0, 100000)) + "shard.%d.index" + faiss.shard_binary_ivf_index_centroids( + index, + shard_count, + template, + None, + True + ) + self.verify_sharded_ivf_indexes( + template, xb, shard_count, self.default_sharding_function) + + def test_save_index_shards_by_centroids_binary_hnsw_quantizer(self): + xb = np.random.randint(256, size=(self.nb, int(self.d / 8))).astype('uint8') + quantizer = faiss.IndexBinaryHNSW(self.d, 32) + index = faiss.IndexBinaryIVF(quantizer, self.d, self.nlist) + shard_count = 13 + + index.quantizer.add(xb) + + template = str(random.randint(0, 100000)) + "shard.%d.index" + faiss.shard_binary_ivf_index_centroids( + index, + 
shard_count, + template, + None, + True + ) + self.verify_sharded_ivf_indexes( + template, xb, shard_count, self.default_sharding_function) + + def test_save_index_shards_without_id_generation(self): + xb = np.random.randint(256, size=(self.nb, int(self.d / 8))).astype('uint8') + quantizer = faiss.IndexBinaryHNSW(self.d, 32) + index = faiss.IndexBinaryIVF(quantizer, self.d, self.nlist) + shard_count = 5 + + index.quantizer.add(xb) + + template = str(random.randint(0, 100000)) + "shard.%d.index" + faiss.shard_binary_ivf_index_centroids( + index, + shard_count, + template, + None, + False + ) + self.verify_sharded_ivf_indexes( + template, xb, shard_count, self.default_sharding_function, False) + + xb = np.random.rand(self.nb, self.d).astype('float32') + quantizer = faiss.IndexHNSWFlat(self.d, 32) + index = faiss.IndexIVFFlat(quantizer, self.d, self.nlist) + shard_count = 23 + + index.quantizer.add(xb) + + template = str(random.randint(0, 100000)) + "shard.%d.index" + faiss.shard_ivf_index_centroids( + index, + shard_count, + template, + None, + False + ) + self.verify_sharded_ivf_indexes( + template, xb, shard_count, self.default_sharding_function, False) diff --git a/tests/test_mmap.cpp b/tests/test_mmap.cpp new file mode 100644 index 0000000000..78549d6878 --- /dev/null +++ b/tests/test_mmap.cpp @@ -0,0 +1,271 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace { + +std::vector make_data(const size_t n, const size_t d, size_t seed) { + std::vector database(n * d); + std::mt19937 rng(seed); + std::uniform_real_distribution distrib; + + for (size_t i = 0; i < n * d; i++) { + database[i] = distrib(rng); + } + return database; +} + +std::vector make_binary_data( + const size_t n, + const size_t d, + size_t seed) { + std::vector database(n * d); + std::mt19937 rng(seed); + std::uniform_int_distribution distrib(0, 255); + + for (size_t i = 0; i < n * d; i++) { + database[i] = distrib(rng); + } + return database; +} + +} // namespace + +// the logic is the following: +// 1. generate two flatcodes-based indices, Index1 and Index2 +// 2. serialize both indices into std::vector<> buffers, Buf1 and Buf2 +// 3. save Buf1 into a temporary file, File1 +// 4. deserialize Index1 using mmap feature on File1 into Index1MM +// 5. ensure that Index1MM acts as Index2 if we write the data from Buf2 +// on top of the existing File1 +// 6. 
ensure that Index1MM acts as Index1 if we write the data from Buf1 +// on top of the existing File1 again + +TEST(TestMmap, mmap_flatcodes) { +#ifdef _AIX + GTEST_SKIP() << "Skipping test on AIX."; +#endif + // generate data + const size_t nt = 1000; + const size_t nq = 10; + const size_t d = 32; + const size_t k = 25; + + std::vector xt1 = make_data(nt, d, 123); + std::vector xt2 = make_data(nt, d, 456); + std::vector xq = make_data(nq, d, 789); + + // ensure that the data is different + ASSERT_NE(xt1, xt2); + + // make index1 and create reference results + faiss::IndexFlatL2 index1(d); + index1.train(nt, xt1.data()); + index1.add(nt, xt1.data()); + + std::vector ref_dis_1(k * nq); + std::vector ref_ids_1(k * nq); + index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data()); + + // make index2 and create reference results + faiss::IndexFlatL2 index2(d); + index2.train(nt, xt2.data()); + index2.add(nt, xt2.data()); + + std::vector ref_dis_2(k * nq); + std::vector ref_ids_2(k * nq); + index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data()); + + // ensure that the results are different + ASSERT_NE(ref_dis_1, ref_dis_2); + ASSERT_NE(ref_ids_1, ref_ids_2); + + // serialize both in a form of vectors + faiss::VectorIOWriter wr1; + faiss::write_index(&index1, &wr1); + + faiss::VectorIOWriter wr2; + faiss::write_index(&index2, &wr2); + + // generate a temporary file and write index1 into it + std::string tmpname = std::tmpnam(nullptr); + + { + std::ofstream ofs(tmpname); + ofs.write((const char*)wr1.data.data(), wr1.data.size()); + } + + // create a mmap index + std::unique_ptr index1mm( + faiss::read_index(tmpname.c_str(), faiss::IO_FLAG_MMAP_IFC)); + + ASSERT_NE(index1mm, nullptr); + + // perform a search + std::vector cand_dis_1(k * nq); + std::vector cand_ids_1(k * nq); + index1mm->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_1); + ASSERT_EQ(ref_dis_1, cand_dis_1); + + // ok 
now, overwrite the internals of the file without recreating it + { + std::ofstream ofs(tmpname); + ofs.seekp(0, std::ios::beg); + + ofs.write((const char*)wr2.data.data(), wr2.data.size()); + } + + // perform a search + std::vector cand_dis_2(k * nq); + std::vector cand_ids_2(k * nq); + index1mm->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_2, cand_ids_2); + ASSERT_EQ(ref_dis_2, cand_dis_2); + + // write back data1 + { + std::ofstream ofs(tmpname); + ofs.seekp(0, std::ios::beg); + + ofs.write((const char*)wr1.data.data(), wr1.data.size()); + } + + // perform a search + std::vector cand_dis_3(k * nq); + std::vector cand_ids_3(k * nq); + index1mm->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_3); + ASSERT_EQ(ref_dis_1, cand_dis_3); +} + +TEST(TestMmap, mmap_binary_flatcodes) { +#ifdef _AIX + GTEST_SKIP() << "Skipping test on AIX."; +#endif + // generate data + const size_t nt = 1000; + const size_t nq = 10; + // in bits + const size_t d = 64; + // in bytes + const size_t d8 = (d + 7) / 8; + const size_t k = 25; + + std::vector xt1 = make_binary_data(nt, d8, 123); + std::vector xt2 = make_binary_data(nt, d8, 456); + std::vector xq = make_binary_data(nq, d8, 789); + + // ensure that the data is different + ASSERT_NE(xt1, xt2); + + // make index1 and create reference results + faiss::IndexBinaryFlat index1(d); + index1.train(nt, xt1.data()); + index1.add(nt, xt1.data()); + + std::vector ref_dis_1(k * nq); + std::vector ref_ids_1(k * nq); + index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data()); + + // make index2 and create reference results + faiss::IndexBinaryFlat index2(d); + index2.train(nt, xt2.data()); + index2.add(nt, xt2.data()); + + std::vector ref_dis_2(k * nq); + std::vector ref_ids_2(k * nq); + index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data()); + + // ensure that the results are different + 
ASSERT_NE(ref_dis_1, ref_dis_2); + ASSERT_NE(ref_ids_1, ref_ids_2); + + // serialize both in a form of vectors + faiss::VectorIOWriter wr1; + faiss::write_index_binary(&index1, &wr1); + + faiss::VectorIOWriter wr2; + faiss::write_index_binary(&index2, &wr2); + + // generate a temporary file and write index1 into it + std::string tmpname = std::tmpnam(nullptr); + + { + std::ofstream ofs(tmpname); + ofs.write((const char*)wr1.data.data(), wr1.data.size()); + } + + // create a mmap index + std::unique_ptr index1mm( + faiss::read_index_binary(tmpname.c_str(), faiss::IO_FLAG_MMAP_IFC)); + + ASSERT_NE(index1mm, nullptr); + + // perform a search + std::vector cand_dis_1(k * nq); + std::vector cand_ids_1(k * nq); + index1mm->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_1); + ASSERT_EQ(ref_dis_1, cand_dis_1); + + // ok now, overwrite the internals of the file without recreating it + { + std::ofstream ofs(tmpname); + ofs.seekp(0, std::ios::beg); + + ofs.write((const char*)wr2.data.data(), wr2.data.size()); + } + + // perform a search + std::vector cand_dis_2(k * nq); + std::vector cand_ids_2(k * nq); + index1mm->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_2, cand_ids_2); + ASSERT_EQ(ref_dis_2, cand_dis_2); + + // write back data1 + { + std::ofstream ofs(tmpname); + ofs.seekp(0, std::ios::beg); + + ofs.write((const char*)wr1.data.data(), wr1.data.size()); + } + + // perform a search + std::vector cand_dis_3(k * nq); + std::vector cand_ids_3(k * nq); + index1mm->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_3); + ASSERT_EQ(ref_dis_1, cand_dis_3); +} diff --git a/tests/test_rabitq.py b/tests/test_rabitq.py new file mode 100644 index 0000000000..4ad0a0bcd4 --- /dev/null +++ b/tests/test_rabitq.py @@ -0,0 +1,445 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import faiss +import numpy as np + +from faiss.contrib import datasets + + +def random_rotation(d, seed=123): + rs = np.random.RandomState(seed) + Q, _ = np.linalg.qr(rs.randn(d, d)) + return Q + + +# based on https://gist.github.com/mdouze/0b2386c31d7fb8b20ae04f3fcbbf4d9d +class ReferenceRabitQ: + """Exact translation of the paper + https://dl.acm.org/doi/pdf/10.1145/3654970 + This is both a quantizer and serves to store the codes + """ + + def __init__(self, d, Bq=4): + self.d = d + self.Bq = Bq + + def train(self, xtrain, P): + self.centroid = xtrain.mean(0) + self.P = P + + def rotation(self, x): + return x @ self.P + + def inv_rotation(self, x): + return x @ self.P.T + + def add(self, Or): + # centering & normalization + Orc = Or - self.centroid + self.O_norms = np.sqrt((Orc**2).sum(1)) # need to store the norms + O = Orc / self.O_norms[:, None] + + # 3.1.3 + self.Xbarb = (self.inv_rotation(Orc) > 0).astype("int8") # 0, 1 + # here the encoded vectors are stored as an int array for simplicity + # but in the real code it would be as a packed uint8 array + # self.Xbarb = np.packbits(self.inv_rotation(Orc) > 0, axis=1) + # reconstruct to compute + Obar = self.rotation((2 * self.Xbarb - 1) / np.sqrt(self.d)) + self.o_Obar = (O * Obar).sum(1) # store dot products + + def distances(self, Qr): + """compute distance estimates for the queries to the stored vectors""" + d = self.d + Bq = self.Bq + + # preproc Qr + Qrc = Qr - self.centroid + Qrc_norms = np.sqrt((Qrc**2).sum(1))[:, None] + Q = Qrc + Qprime = self.inv_rotation(Q) + + # quantize queries to Bq bits + mins, maxes = Qprime.min(axis=1)[:, None], Qprime.max(axis=1)[:, None] + Delta = (maxes - mins) / (2**Bq - 1) + + # article mentioned a randomized variant + # qbar = np.floor((Qprime - mins) / Delta + rs.rand(nq, d)) + + # we'll use a non-randomized for the 
comparison purposes + qbar = np.round((Qprime - mins) / Delta) + # in the real implementation, this would be re-ordered + # in least-to most-significant bit + # dot product matrix, integers -- this is the expensive operation + dp = (qbar[:, None, :] * self.Xbarb[None, :, :]).sum(2) + + # the operations below roll back the normalizations to get the distance + # estimates. it is likely that they could be merged + # or some of them could be left out because we are interested only + # in top-k compute (eq 19-20) + sum_X = self.Xbarb.sum(1) + sum_Q = qbar.sum(1)[:, None] + sD = np.sqrt(d) + xbar_qbar = 2 * Delta / sD * dp + xbar_qbar += 2 * mins / sD * sum_X + xbar_qbar -= Delta / sD * sum_Q + xbar_qbar -= sD * mins + + # is close to thm 3.3 + # = eq 17 + + # / estimates (thm 3.2) + q_o = xbar_qbar / self.o_Obar + + # eq 1-2 to de-normalize and get distances + dis2_q_o = self.O_norms**2 + Qrc_norms**2 - 2 * self.O_norms * q_o + + return dis2_q_o + + +class ReferenceIVFRabitQ: + """straightforward IVF implementation""" + + def __init__(self, d, nlist, Bq=4): + self.d = d + self.nlist = nlist + self.invlists = [ReferenceRabitQ(d, Bq) for _ in range(nlist)] + self.quantizer = None + self.nprobe = 1 + + def train(self, xtrain, P): + if self.quantizer is None: + km = faiss.Kmeans(self.d, self.nlist, niter=10) + km.train(xtrain) + centroids = km.centroids + self.quantizer = faiss.IndexFlatL2(self.d) + self.quantizer.add(centroids) + else: + centroids = self.quantizer.reconstruct_n() + # Override the RabitQ train() to use a common random rotation + # and force centroids from the coarse quantizer + for list_no, rq in enumerate(self.invlists): + rq.centroid = centroids[list_no] + rq.P = P + + def add(self, x): + _, keys = self.quantizer.search(x, 1) + keys = keys.ravel() + n_per_invlist = np.bincount(keys, minlength=self.nlist) + order = np.argsort(keys) + i0 = 0 + for list_no, rab in enumerate(self.invlists): + i1 = i0 + n_per_invlist[list_no] + rab.list_size = i1 - i0 + if i1 
> i0: + ids = order[i0:i1] + rab.ids = ids + rab.add(x[ids]) + i0 = i1 + + def search(self, x, k): + nq = len(x) + nprobe = self.nprobe + D = np.zeros((nq, k), dtype="float32") + I = np.zeros((nq, k), dtype=int) + D[:] = np.nan + I[:] = -1 + _, Ic = self.quantizer.search(x, nprobe) + + for qno, xq in enumerate(x): + # naive top-k implemetation with a full sort + q_dis = [] + q_ids = [] + for probe in range(nprobe): + rab = self.invlists[Ic[qno, probe]] + if rab.list_size == 0: + continue + # we cannot exploit the batch version + # of the queries (in this form) + dis = rab.distances(xq[None, :]) + q_ids.append(rab.ids) + q_dis.append(dis.ravel()) + q_dis = np.hstack(q_dis) + q_ids = np.hstack(q_ids) + o = q_dis.argsort() + kq = min(k, len(q_dis)) + D[qno, :kq] = q_dis[o[:kq]] + I[qno, :kq] = q_ids[o[:kq]] + return D, I + + +class TestRaBitQ(unittest.TestCase): + def do_comparison_vs_pq_test(self, metric_type=faiss.METRIC_L2): + ds = datasets.SyntheticDataset(128, 4096, 4096, 100) + k = 10 + + # PQ 8-to-1 + index_pq = faiss.IndexPQ(ds.d, 16, 8, metric_type) + index_pq.train(ds.get_train()) + index_pq.add(ds.get_database()) + _, I_pq = index_pq.search(ds.get_queries(), k) + + index_rbq = faiss.IndexRaBitQ(ds.d, metric_type) + index_rbq.train(ds.get_train()) + index_rbq.add(ds.get_database()) + _, I_rbq = index_rbq.search(ds.get_queries(), k) + + # try quantized query + rbq_params = faiss.RaBitQSearchParameters(qb=8) + _, I_rbq_q8 = index_rbq.search(ds.get_queries(), k, params=rbq_params) + + rbq_params = faiss.RaBitQSearchParameters(qb=4) + _, I_rbq_q4 = index_rbq.search(ds.get_queries(), k, params=rbq_params) + + index_flat = faiss.IndexFlat(ds.d, metric_type) + index_flat.train(ds.get_train()) + index_flat.add(ds.get_database()) + _, I_f = index_flat.search(ds.get_queries(), k) + + # ensure that RaBitQ and PQ are relatively close + eval_pq = faiss.eval_intersection(I_pq[:, :k], I_f[:, :k]) + eval_pq /= ds.nq * k + eval_rbq = faiss.eval_intersection(I_rbq[:, :k], 
I_f[:, :k]) + eval_rbq /= ds.nq * k + eval_rbq_q8 = faiss.eval_intersection(I_rbq_q8[:, :k], I_f[:, :k]) + eval_rbq_q8 /= ds.nq * k + eval_rbq_q4 = faiss.eval_intersection(I_rbq_q4[:, :k], I_f[:, :k]) + eval_rbq_q4 /= ds.nq * k + + print( + f"PQ is {eval_pq}, " + f"RaBitQ is {eval_rbq}, " + f"q8 RaBitQ is {eval_rbq_q8}, " + f"q4 RaBitQ is {eval_rbq_q4}" + ) + + np.testing.assert_(abs(eval_pq - eval_rbq) < 0.05) + np.testing.assert_(abs(eval_pq - eval_rbq_q8) < 0.05) + np.testing.assert_(abs(eval_pq - eval_rbq_q4) < 0.05) + np.testing.assert_(eval_pq > 0.55) + + def test_comparison_vs_pq_L2(self): + self.do_comparison_vs_pq_test(faiss.METRIC_L2) + + def test_comparison_vs_pq_IP(self): + self.do_comparison_vs_pq_test(faiss.METRIC_INNER_PRODUCT) + + def test_comparison_vs_ref_L2_rrot(self, rrot_seed=123): + ds = datasets.SyntheticDataset(128, 4096, 4096, 1) + + ref_rbq = ReferenceRabitQ(ds.d, Bq=8) + ref_rbq.train(ds.get_train(), random_rotation(ds.d, rrot_seed)) + ref_rbq.add(ds.get_database()) + + index_rbq = faiss.IndexRaBitQ(ds.d, faiss.METRIC_L2) + index_rbq.qb = 8 + + # wrap with random rotations + rrot = faiss.RandomRotationMatrix(ds.d, ds.d) + rrot.init(rrot_seed) + + index_cand = faiss.IndexPreTransform(rrot, index_rbq) + index_cand.train(ds.get_train()) + index_cand.add(ds.get_database()) + + ref_dis = ref_rbq.distances(ds.get_queries()) + + dc = index_cand.get_distance_computer() + xq = ds.get_queries() + + # ensure that the correlation coefficient is very high + dc_dist = [0] * ds.nb + + dc.set_query(faiss.swig_ptr(xq[0])) + for j in range(ds.nb): + dc_dist[j] = dc(j) + + corr = np.corrcoef(dc_dist, ref_dis[0])[0, 1] + print(corr) + np.testing.assert_(corr > 0.9) + + def test_comparison_vs_ref_L2(self): + ds = datasets.SyntheticDataset(128, 4096, 4096, 1) + + ref_rbq = ReferenceRabitQ(ds.d, Bq=8) + ref_rbq.train(ds.get_train(), np.identity(ds.d)) + ref_rbq.add(ds.get_database()) + + index_rbq = faiss.IndexRaBitQ(ds.d, faiss.METRIC_L2) + index_rbq.qb = 8 + 
index_rbq.train(ds.get_train()) + index_rbq.add(ds.get_database()) + + ref_dis = ref_rbq.distances(ds.get_queries()) + + dc = index_rbq.get_distance_computer() + xq = ds.get_queries() + + dc.set_query(faiss.swig_ptr(xq[0])) + for j in range(ds.nb): + upd_dis = dc(j) + # print(f"{j} {ref_dis[0][j]} {upd_dis}") + np.testing.assert_(abs(ref_dis[0][j] - upd_dis) < 0.001) + + def do_test_serde(self, description): + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + + index = faiss.index_factory(ds.d, description) + index.train(ds.get_train()) + index.add(ds.get_database()) + + Dref, Iref = index.search(ds.get_queries(), 10) + + b = faiss.serialize_index(index) + index2 = faiss.deserialize_index(b) + + Dnew, Inew = index2.search(ds.get_queries(), 10) + + np.testing.assert_equal(Dref, Dnew) + np.testing.assert_equal(Iref, Inew) + + def test_serde_rabitq(self): + self.do_test_serde("RaBitQ") + + +class TestIVFRaBitQ(unittest.TestCase): + def test_comparison_vs_ref_L2(self): + ds = datasets.SyntheticDataset(128, 4096, 4096, 100) + + k = 10 + nlist = 200 + ref_rbq = ReferenceIVFRabitQ(ds.d, nlist, Bq=4) + ref_rbq.train(ds.get_train(), np.identity(ds.d)) + ref_rbq.add(ds.get_database()) + + index_flat = faiss.IndexFlat(ds.d, faiss.METRIC_L2) + index_rbq = faiss.IndexIVFRaBitQ( + index_flat, ds.d, nlist, faiss.METRIC_L2 + ) + index_rbq.qb = 4 + index_rbq.train(ds.get_train()) + index_rbq.add(ds.get_database()) + + for nprobe in 1, 4, 16: + ref_rbq.nprobe = nprobe + Dref, Iref = ref_rbq.search(ds.get_queries(), k) + r_ref_k = faiss.eval_intersection( + Iref[:, :k], ds.get_groundtruth()[:, :k] + ) / (ds.nq * k) + print(f"{nprobe=} k-recall@10={r_ref_k}") + + params = faiss.IVFRaBitQSearchParameters() + params.qb = index_rbq.qb + params.nprobe = nprobe + _, Inew, _ = faiss.search_with_parameters( + index_rbq, ds.get_queries(), k, params, output_stats=True + ) + r_new_k = faiss.eval_intersection( + Inew[:, :k], ds.get_groundtruth()[:, :k] + ) / (ds.nq * k) + print(f"{nprobe=} 
k-recall@10={r_new_k}") + + np.testing.assert_almost_equal(r_ref_k, r_new_k, 3) + + def test_comparison_vs_ref_L2_rrot(self): + ds = datasets.SyntheticDataset(128, 4096, 4096, 100) + + k = 10 + nlist = 200 + rrot_seed = 123 + + ref_rbq = ReferenceIVFRabitQ(ds.d, nlist, Bq=4) + ref_rbq.train(ds.get_train(), random_rotation(ds.d, rrot_seed)) + ref_rbq.add(ds.get_database()) + + index_flat = faiss.IndexFlat(ds.d, faiss.METRIC_L2) + index_rbq = faiss.IndexIVFRaBitQ( + index_flat, ds.d, nlist, faiss.METRIC_L2 + ) + index_rbq.qb = 4 + + # wrap with random rotations + rrot = faiss.RandomRotationMatrix(ds.d, ds.d) + rrot.init(rrot_seed) + + index_cand = faiss.IndexPreTransform(rrot, index_rbq) + index_cand.train(ds.get_train()) + index_cand.add(ds.get_database()) + + for nprobe in 1, 4, 16: + ref_rbq.nprobe = nprobe + Dref, Iref = ref_rbq.search(ds.get_queries(), k) + r_ref_k = faiss.eval_intersection( + Iref[:, :k], ds.get_groundtruth()[:, :k] + ) / (ds.nq * k) + print(f"{nprobe=} k-recall@10={r_ref_k}") + + params = faiss.IVFRaBitQSearchParameters() + params.qb = index_rbq.qb + params.nprobe = nprobe + Dnew, Inew, stats2 = faiss.search_with_parameters( + index_cand, ds.get_queries(), k, params, output_stats=True + ) + r_new_k = faiss.eval_intersection( + Inew[:, :k], ds.get_groundtruth()[:, :k] + ) / (ds.nq * k) + print(f"{nprobe=} k-recall@10={r_new_k}") + + np.testing.assert_almost_equal(r_ref_k, r_new_k, 2) + + def do_test_serde(self, description): + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + + xt = ds.get_train() + xb = ds.get_database() + + index = faiss.index_factory(ds.d, description) + index.train(xt) + index.add(xb) + + Dref, Iref = index.search(ds.get_queries(), 10) + + b = faiss.serialize_index(index) + index2 = faiss.deserialize_index(b) + + Dnew, Inew = index2.search(ds.get_queries(), 10) + + np.testing.assert_equal(Dref, Dnew) + np.testing.assert_equal(Iref, Inew) + + def test_serde_ivfrabitq(self): + self.do_test_serde("IVF16,RaBitQ") + + +class 
TestRaBitQuantizerEncodeDecode(unittest.TestCase): + def do_test_encode_decode(self, d, metric): + # rabitq must precisely reconstruct a vector, + # which consists of +A and -A values + + seed = 123 + rs = np.random.RandomState(seed) + + ampl = 100 + n = 10 + vec = (2 * rs.randint(0, 2, d * n) - 1).astype(np.float32) * ampl + vec = np.reshape(vec, (n, d)) + + quantizer = faiss.RaBitQuantizer(d, metric) + + # encode and decode + vec_q = quantizer.compute_codes(vec) + vec_rec = quantizer.decode(vec_q) + + # verify + np.testing.assert_equal(vec, vec_rec) + + def test_encode_decode_L2(self): + self.do_test_encode_decode(16, faiss.METRIC_L2) + + def test_encode_decode_IP(self): + self.do_test_encode_decode(16, faiss.METRIC_INNER_PRODUCT) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 18436edf4d..56c2cd95ee 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -22,14 +22,38 @@ class TestSelector(unittest.TestCase): combinations as possible. """ - def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2, k=10): + def do_test_id_selector( + self, + index_key, + id_selector_type="batch", + mt=faiss.METRIC_L2, + k=10, + use_heap=True + ): """ Verify that the id selector returns the subset of results that are members according to the IDSelector. 
Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" """ - ds = datasets.SyntheticDataset(32, 1000, 100, 20) - index = faiss.index_factory(ds.d, index_key, mt) - index.train(ds.get_train()) + d = 32 # make sure dimension is multiple of 8 for binary + ds = datasets.SyntheticDataset(d, 1000, 100, 20) + + if index_key == "BinaryFlat": + rs = np.random.RandomState(123) + xb = rs.randint(256, size=(ds.nb, d // 8), dtype='uint8') + xq = rs.randint(256, size=(ds.nq, d // 8), dtype='uint8') + index = faiss.IndexBinaryFlat(d) + index.use_heap = use_heap + # Use smaller radius for Hamming distance + base_radius = 4 + is_binary = True + else: + xb = ds.get_database() + xq = ds.get_queries() + xt = ds.get_train() + index = faiss.index_factory(d, index_key, mt) + index.train(xt) + base_radius = float('inf') # Will be set based on results + is_binary = False # reference result if "range" in id_selector_type: @@ -54,20 +78,22 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR subset = np.setxor1d(lhs_subset, rhs_subset) else: rs = np.random.RandomState(123) - subset = rs.choice(ds.nb, 50, replace=False).astype("int64") - # add_with_ids not supported for all index types - # index.add_with_ids(ds.get_database()[subset], subset) - index.add(ds.get_database()[subset]) + subset = rs.choice(ds.nb, 50, replace=False).astype('int64') + + index.add(xb[subset]) if "IVF" in index_key and id_selector_type == "range_sorted": self.assertTrue(index.check_ids_sorted()) - Dref, Iref0 = index.search(ds.get_queries(), k) + Dref, Iref0 = index.search(xq, k) Iref = subset[Iref0] Iref[Iref0 < 0] = -1 - radius = float(Dref[Iref > 0].max()) * 1.01 + if base_radius == float('inf'): + radius = float(Dref[Iref > 0].max()) * 1.01 + else: + radius = base_radius + try: - Rlims_ref, RDref, RIref = index.range_search( - ds.get_queries(), radius) + Rlims_ref, RDref, RIref = index.range_search(xq, radius) except RuntimeError as e: if "not 
implemented" in str(e): have_range_search = False @@ -81,7 +107,7 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR # result with selector: fill full database and search with selector index.reset() - index.add(ds.get_database()) + index.add(xb) if id_selector_type == "range": sel = faiss.IDSelectorRange(30, 80) elif id_selector_type == "range_sorted": @@ -123,17 +149,53 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR faiss.SearchParametersPQ(sel=sel) if "PQ" in index_key else faiss.SearchParameters(sel=sel) ) - Dnew, Inew = index.search(ds.get_queries(), k, params=params) - np.testing.assert_array_equal(Iref, Inew) - np.testing.assert_almost_equal(Dref, Dnew, decimal=5) + + Dnew, Inew = index.search(xq, k, params=params) + + if is_binary: + # For binary indexes, we need to check: + # 1. All returned IDs are valid (in the subset or -1) + # 2. The distances match + + # Check that all returned IDs are valid + valid_ids = np.ones_like(Inew, dtype=bool) + # Create a mask of valid IDs (those in subset) + subset_set = set(subset) # Convert to set for O(1) lookups + # Handle -1 values separately (they're always valid) + valid_ids = np.logical_or( + Inew == -1, + np.isin(Inew, list(subset_set)) + ) + + self.assertTrue(np.all(valid_ids), "Some returned IDs are not in the subset") + + # Check that distances match + np.testing.assert_almost_equal(Dref, Dnew, decimal=5) + else: + # For non-binary indexes, we can do exact comparison + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_almost_equal(Dref, Dnew, decimal=5) if have_range_search: - Rlims_new, RDnew, RInew = index.range_search( - ds.get_queries(), radius, params=params) + Rlims_new, RDnew, RInew = index.range_search(xq, radius, params=params) np.testing.assert_array_equal(Rlims_ref, Rlims_new) RDref, RIref = sort_range_res_2(Rlims_ref, RDref, RIref) - np.testing.assert_array_equal(RIref, RInew) - np.testing.assert_almost_equal(RDref, 
RDnew, decimal=5) + + if is_binary: + # For binary indexes, check that all returned IDs are valid + valid_ids = np.ones(len(RInew), dtype=bool) + # Use vectorized operation instead of loop + subset_set = set(subset) # Convert to set for O(1) lookups + valid_ids = np.isin(RInew, list(subset_set)) + + self.assertTrue(np.all(valid_ids), "Some range search IDs are not in the subset") + + # Check that distances match + np.testing.assert_almost_equal(RDref, RDnew, decimal=5) + else: + # For non-binary indexes, we can do exact comparison + np.testing.assert_array_equal(RIref, RInew) + np.testing.assert_almost_equal(RDref, RDnew, decimal=5) def test_IVFFlat(self): self.do_test_id_selector("IVF32,Flat") @@ -284,6 +346,17 @@ def test_bounds(self): distances, indices = index_ip.search(xb[:2], k=3, params=search_params) distances, indices = index_l2.search(xb[:2], k=3, params=search_params) + def test_BinaryFlat(self): + self.do_test_id_selector("BinaryFlat") + + def test_BinaryFlat_id_range(self): + self.do_test_id_selector("BinaryFlat", id_selector_type="range") + + def test_BinaryFlat_id_array(self): + self.do_test_id_selector("BinaryFlat", id_selector_type="array") + + def test_BinaryFlat_no_heap(self): + self.do_test_id_selector("BinaryFlat", use_heap=False) class TestSearchParams(unittest.TestCase): diff --git a/tests/test_zerocopy.cpp b/tests/test_zerocopy.cpp new file mode 100644 index 0000000000..9b8734bd53 --- /dev/null +++ b/tests/test_zerocopy.cpp @@ -0,0 +1,243 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +std::vector make_data(const size_t n, const size_t d, size_t seed) { + std::vector database(n * d); + std::mt19937 rng(seed); + std::uniform_real_distribution distrib; + + for (size_t i = 0; i < n * d; i++) { + database[i] = distrib(rng); + } + return database; +} + +std::vector make_binary_data( + const size_t n, + const size_t d, + size_t seed) { + std::vector database(n * d); + std::mt19937 rng(seed); + std::uniform_int_distribution distrib(0, 255); + + for (size_t i = 0; i < n * d; i++) { + database[i] = distrib(rng); + } + return database; +} + +} // namespace + +// the logic is the following: +// 1. generate two flatcodes-based indices, Index1 and Index2 +// 2. serialize both indices into std::vector<> buffers, Buf1 and Buf2 +// 3. deserialize Index1 using zero-copy feature on Buf1 into Index1ZC +// 4. ensure that Index1ZC acts as Index2 if we write the data from Buf2 +// on top of the existing Buf1 + +TEST(TestZeroCopy, zerocopy_flatcodes) { + // generate data + const size_t nt = 1000; + const size_t nq = 10; + const size_t d = 32; + const size_t k = 25; + + std::vector xt1 = make_data(nt, d, 123); + std::vector xt2 = make_data(nt, d, 456); + std::vector xq = make_data(nq, d, 789); + + // ensure that the data is different + ASSERT_NE(xt1, xt2); + + // make index1 and create reference results + faiss::IndexFlatL2 index1(d); + index1.train(nt, xt1.data()); + index1.add(nt, xt1.data()); + + std::vector ref_dis_1(k * nq); + std::vector ref_ids_1(k * nq); + index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data()); + + // make index2 and create reference results + faiss::IndexFlatL2 index2(d); + index2.train(nt, xt2.data()); + index2.add(nt, xt2.data()); + + std::vector ref_dis_2(k * nq); + std::vector ref_ids_2(k * nq); + index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data()); + + // ensure that the results are 
different + ASSERT_NE(ref_dis_1, ref_dis_2); + ASSERT_NE(ref_ids_1, ref_ids_2); + + // serialize both in a form of vectors + faiss::VectorIOWriter wr1; + faiss::write_index(&index1, &wr1); + + faiss::VectorIOWriter wr2; + faiss::write_index(&index2, &wr2); + + ASSERT_EQ(wr1.data.size(), wr2.data.size()); + + // clone a buffer + std::vector buffer = wr1.data; + + // create a zero-copy index + faiss::ZeroCopyIOReader reader(buffer.data(), buffer.size()); + std::unique_ptr index1zc(faiss::read_index(&reader)); + + ASSERT_NE(index1zc, nullptr); + + // perform a search + std::vector cand_dis_1(k * nq); + std::vector cand_ids_1(k * nq); + index1zc->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_1); + ASSERT_EQ(ref_dis_1, cand_dis_1); + + // overwrite buffer without moving it + for (size_t i = 0; i < buffer.size(); i++) { + buffer[i] = wr2.data[i]; + } + + // perform a search + std::vector cand_dis_2(k * nq); + std::vector cand_ids_2(k * nq); + index1zc->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data()); + + // match vs ref2 + ASSERT_EQ(ref_ids_2, cand_ids_2); + ASSERT_EQ(ref_dis_2, cand_dis_2); + + // overwrite again + for (size_t i = 0; i < buffer.size(); i++) { + buffer[i] = wr1.data[i]; + } + + // perform a search + std::vector cand_dis_3(k * nq); + std::vector cand_ids_3(k * nq); + index1zc->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_3); + ASSERT_EQ(ref_dis_1, cand_dis_3); +} + +TEST(TestZeroCopy, zerocopy_binary_flatcodes) { + // generate data + const size_t nt = 1000; + const size_t nq = 10; + // in bits + const size_t d = 64; + // in bytes + const size_t d8 = (d + 7) / 8; + const size_t k = 25; + + std::vector xt1 = make_binary_data(nt, d8, 123); + std::vector xt2 = make_binary_data(nt, d8, 456); + std::vector xq = make_binary_data(nq, d8, 789); + + // ensure that the data is different + ASSERT_NE(xt1, xt2); + 
+ // make index1 and create reference results + faiss::IndexBinaryFlat index1(d); + index1.train(nt, xt1.data()); + index1.add(nt, xt1.data()); + + std::vector ref_dis_1(k * nq); + std::vector ref_ids_1(k * nq); + index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data()); + + // make index2 and create reference results + faiss::IndexBinaryFlat index2(d); + index2.train(nt, xt2.data()); + index2.add(nt, xt2.data()); + + std::vector ref_dis_2(k * nq); + std::vector ref_ids_2(k * nq); + index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data()); + + // ensure that the results are different + ASSERT_NE(ref_dis_1, ref_dis_2); + ASSERT_NE(ref_ids_1, ref_ids_2); + + // serialize both in a form of vectors + faiss::VectorIOWriter wr1; + faiss::write_index_binary(&index1, &wr1); + + faiss::VectorIOWriter wr2; + faiss::write_index_binary(&index2, &wr2); + + ASSERT_EQ(wr1.data.size(), wr2.data.size()); + + // clone a buffer + std::vector buffer = wr1.data; + + // create a zero-copy index + faiss::ZeroCopyIOReader reader(buffer.data(), buffer.size()); + std::unique_ptr index1zc( + faiss::read_index_binary(&reader)); + + ASSERT_NE(index1zc, nullptr); + + // perform a search + std::vector cand_dis_1(k * nq); + std::vector cand_ids_1(k * nq); + index1zc->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_1); + ASSERT_EQ(ref_dis_1, cand_dis_1); + + // overwrite buffer without moving it + for (size_t i = 0; i < buffer.size(); i++) { + buffer[i] = wr2.data[i]; + } + + // perform a search + std::vector cand_dis_2(k * nq); + std::vector cand_ids_2(k * nq); + index1zc->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data()); + + // match vs ref2 + ASSERT_EQ(ref_ids_2, cand_ids_2); + ASSERT_EQ(ref_dis_2, cand_dis_2); + + // overwrite again + for (size_t i = 0; i < buffer.size(); i++) { + buffer[i] = wr1.data[i]; + } + + // perform a search + std::vector cand_dis_3(k * nq); + std::vector 
cand_ids_3(k * nq); + index1zc->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data()); + + // match vs ref1 + ASSERT_EQ(ref_ids_1, cand_ids_3); + ASSERT_EQ(ref_dis_1, cand_dis_3); +}